diff --git a/ci/test_integration.sh b/ci/test_integration.sh
index cdcc1c442ca..5bbd7aa6224 100755
--- a/ci/test_integration.sh
+++ b/ci/test_integration.sh
@@ -31,6 +31,9 @@ echo "=== ASR (backend=pytorch, model=rnn-no-ctc) ==="
 echo "=== ASR (backend=pytorch, model=transformer) ==="
 ./run.sh --stage 4 --train-config conf/train_transformer.yaml \
     --decode-config conf/decode.yaml
+echo "=== ASR (backend=pytorch, model=conformer) ==="
+./run.sh --stage 4 --train-config conf/train_conformer.yaml \
+    --decode-config conf/decode.yaml
 echo "=== ASR (backend=pytorch, model=transformer-pure-ctc) ==="
 ./run.sh --stage 4 --train-config conf/train_transformer_pure_ctc.yaml \
     --decode-config conf/decode_pure_ctc.yaml
@@ -94,7 +97,7 @@ done
 for t in ${feats_types}; do
     for t2 in ${token_types}; do
         echo "==== feats_type=${t}, token_types=${t2} ==="
-        ./run.sh --ngpu 0 --stage 6 --stop-stage 100 --feats-type "${t}" --token-type "${t2}" \
+        ./run.sh --ngpu 0 --stage 6 --stop-stage 13 --feats-type "${t}" --token-type "${t2}" \
             --asr-args "--max_epoch=1" --lm-args "--max_epoch=1"
     done
 done
@@ -109,7 +112,7 @@ echo "==== [ESPnet2] TTS ==="
 feats_types="raw fbank stft"
 for t in ${feats_types}; do
     echo "==== feats_type=${t} ==="
-    ./run.sh --ngpu 0 --stage 2 --stop-stage 100 --feats-type "${t}" --train-args "--max_epoch 1"
+    ./run.sh --ngpu 0 --stage 2 --stop-stage 8 --feats-type "${t}" --train-args "--max_epoch 1"
 done
 # Remove generated files in order to reduce the disk usage
 rm -rf exp dump data
diff --git a/egs/README.md b/egs/README.md
index 7375448fc4d..153a1535926 100644
--- a/egs/README.md
+++ b/egs/README.md
@@ -59,4 +59,5 @@ See: https://espnet.github.io/espnet/tutorial.html
 | voxforge | VoxForge | ASR | 7 languages | http://www.voxforge.org/ | |
 | wsj | CSR-I (WSJ0) Complete, CSR-II (WSJ1) Complete | ASR | EN | https://catalog.ldc.upenn.edu/LDC93S6A,https://catalog.ldc.upenn.edu/LDC94S13A | |
 | wsj_mix | MERL WSJ0-mix multi-speaker dataset | Multispeaker ASR | EN | http://www.merl.com/demos/deep-clustering | |
-| yesno | The "yesno" corpus | ASR | HE | http://www.openslr.org/1 | |
+| yesno | The "yesno" corpus | ASR | HE | http://www.openslr.org/1 | |
+| Yoloxóchitl-Mixtec | The Yoloxóchitl-Mixtec corpus | ASR | Mixtec | http://www.openslr.org/89 | |
diff --git a/egs/aishell/asr1/RESULTS.md b/egs/aishell/asr1/RESULTS.md
index 91d799cdc10..2d0ebb77f98 100644
--- a/egs/aishell/asr1/RESULTS.md
+++ b/egs/aishell/asr1/RESULTS.md
@@ -1,3 +1,16 @@
+# Conformer result
+
+- training config file: `conf/tuning/train_pytorch_conformer.yaml`
+- decoding config file: `conf/decode.yaml`
+```
+exp/train_sp_pytorch_train_pytorch_conformer/decode_dev_decode_pytorch_conformer/result.txt
+| SPKR    | # Snt  # Wrd | Corr  Sub  Del  Ins  Err  S.Err |
+| Sum/Avg | 14326 205341 | 94.7  5.1  0.1  0.1  5.4  39.0  |
+exp/train_sp_pytorch_train_pytorch_conformer/decode_test_decode_pytorch_conformer/result.txt
+| SPKR    | # Snt  # Wrd | Corr  Sub  Del  Ins  Err  S.Err |
+| Sum/Avg |  7176 104765 | 94.2  5.6  0.2  0.1  5.9  41.8  |
+```
+
 # Transformer result (default transformer with initial learning rate = 1.0 and epochs = 50)
 
 - Environments (obtained by `$ get_sys_info.sh`)
@@ -64,3 +77,4 @@ exp/train_sp_pytorch_train_pytorch_transformer_lr1.0/decode_test_decode_pytorch_
 | SPKR    | # Snt  # Wrd | Corr  Sub  Del  Ins  Err  S.Err |
 | Sum/Avg |  7176 104765 | 92.7  7.1  0.2  0.1  7.4  49.8  |
 ```
+
diff --git a/egs/aishell/asr1/conf/tuning/train_pytorch_conformer.yaml b/egs/aishell/asr1/conf/tuning/train_pytorch_conformer.yaml
new file mode 100644
index 00000000000..50d44abb5ab
--- /dev/null
+++ b/egs/aishell/asr1/conf/tuning/train_pytorch_conformer.yaml
@@ -0,0 +1,47 @@
+# network architecture
+# encoder related
+elayers: 12
+eunits: 2048
+# decoder related
+dlayers: 6
+dunits: 2048
+# attention related
+adim: 256
+aheads: 4
+
+# hybrid CTC/attention
+mtlalpha: 0.3
+
+# label smoothing
+lsm-weight: 0.1
+
+# minibatch related
+batch-size: 32
+maxlen-in: 512  # if input length > maxlen-in, batchsize is automatically reduced
+maxlen-out: 150 # if output length > maxlen-out, batchsize is automatically reduced
+
+# optimization related
+sortagrad: 0 # Feed samples from shortest to longest; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
+opt: noam
+accum-grad: 2
+grad-clip: 5
+patience: 0
+epochs: 50
+dropout-rate: 0.1
+
+# transformer specific setting
+backend: pytorch
+model-module: "espnet.nets.pytorch_backend.e2e_asr_conformer:E2E"
+transformer-input-layer: conv2d # encoder architecture type
+transformer-lr: 1.0
+transformer-warmup-steps: 25000
+transformer-attn-dropout-rate: 0.0
+transformer-length-normalized-loss: false
+transformer-init: pytorch
+
+# conformer specific setting
+transformer-encoder-pos-enc-layer-type: rel_pos
+transformer-encoder-selfattn-layer-type: rel_selfattn
+macaron-style: true
+use-cnn-module: true
+cnn-module-kernel: 31
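The config above drives the optimizer through the `opt: noam` / `transformer-lr` / `transformer-warmup-steps` trio. For reference, here is a minimal sketch of the Noam schedule these knobs feed, assuming the standard formulation (a constant factor scaled by `adim^-0.5`, linear warmup, then inverse-square-root decay); the function name `noam_lr` is illustrative, not part of ESPnet's API:

```python
# Sketch of the Noam learning-rate schedule, assuming the standard form:
#   lr(step) = factor * adim^-0.5 * min(step^-0.5, step * warmup^-1.5)
# The peak learning rate is reached exactly at step == warmup.

def noam_lr(step: int, factor: float = 1.0, adim: int = 256,
            warmup: int = 25000) -> float:
    """Learning rate at optimizer step `step` (step >= 1)."""
    return factor * adim ** -0.5 * min(step ** -0.5, step * warmup ** -1.5)

# Warmup ramps linearly from near zero, then decays as step^-0.5:
print(f"{noam_lr(1):.2e}")       # 1.58e-08, start of warmup
print(f"{noam_lr(25000):.2e}")   # 3.95e-04, peak at the warmup boundary
print(f"{noam_lr(100000):.2e}")  # 1.98e-04, inverse-sqrt decay
```

This is why `transformer-lr: 1.0` does not mean a literal learning rate of 1.0: it is only the scaling factor, and the effective peak here is roughly 4e-4.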
diff --git a/egs/mini_an4/asr1/conf/train_conformer.yaml b/egs/mini_an4/asr1/conf/train_conformer.yaml
new file mode 100644
index 00000000000..2ed64ae935d
--- /dev/null
+++ b/egs/mini_an4/asr1/conf/train_conformer.yaml
@@ -0,0 +1,47 @@
+# network architecture
+# encoder related
+elayers: 2
+eunits: 32
+# decoder related
+dlayers: 2
+dunits: 32
+# attention related
+adim: 16
+aheads: 4
+
+# hybrid CTC/attention
+mtlalpha: 0.3
+
+# label smoothing
+lsm-weight: 0.1
+
+# minibatch related
+batch-size: 2
+maxlen-in: 512  # if input length > maxlen-in, batchsize is automatically reduced
+maxlen-out: 150 # if output length > maxlen-out, batchsize is automatically reduced
+
+# optimization related
+sortagrad: 0 # Feed samples from shortest to longest; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
+opt: noam
+accum-grad: 2
+grad-clip: 5
+patience: 0
+epochs: 3
+dropout-rate: 0.1
+
+# transformer specific setting
+backend: pytorch
+model-module: "espnet.nets.pytorch_backend.e2e_asr_conformer:E2E"
+transformer-input-layer: conv2d # encoder architecture type
+transformer-lr: 5.0
+transformer-warmup-steps: 25000
+transformer-attn-dropout-rate: 0.0
+transformer-length-normalized-loss: false
+transformer-init: pytorch
+
+# conformer specific setting
+transformer-encoder-pos-enc-layer-type: rel_pos
+transformer-encoder-selfattn-layer-type: rel_selfattn
+macaron-style: true
+use-cnn-module: true
+cnn-module-kernel: 31
diff --git a/egs/wsj/asr1/local/filtering_samples.py b/egs/wsj/asr1/local/filtering_samples.py
index d2ad3dff7cb..ef421e39279 100755
--- a/egs/wsj/asr1/local/filtering_samples.py
+++ b/egs/wsj/asr1/local/filtering_samples.py
@@ -50,7 +50,7 @@
     args = parser.parse_args(cmd_args)
 
     # subsampling info
-    if args.etype.startswith("vgg"):
+    if hasattr(args, "etype") and args.etype.startswith("vgg"):
        # Subsampling is not performed for vgg*.
        # It is performed in max pooling layers at CNN.
        min_io_ratio = 4
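The `hasattr` guard added to `filtering_samples.py` matters because `--etype` is an RNN-encoder option: with a transformer or conformer config the parsed namespace presumably never receives an `etype` attribute, so the old `args.etype` access raised `AttributeError`. A minimal self-contained illustration of the failure mode and the fix (the bare parser below is a hypothetical stand-in, not the script's actual argument setup):

```python
# Hypothetical repro of the bug the hasattr() guard avoids: if --etype is
# never registered, the parsed namespace simply has no such attribute.
import argparse

parser = argparse.ArgumentParser()
# An RNN config path would add --etype here; a conformer config does not.
args = parser.parse_args([])

# Old form: raises AttributeError on conformer/transformer configs.
try:
    vgg_like = args.etype.startswith("vgg")
except AttributeError:
    vgg_like = False

# Patched form: hasattr() short-circuits before args.etype is touched.
vgg_like = hasattr(args, "etype") and args.etype.startswith("vgg")
print(vgg_like)  # False
```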
diff --git a/egs/yoloxochitl_mixtec/asr1/RESULTS.md b/egs/yoloxochitl_mixtec/asr1/RESULTS.md
new file mode 100644
index 00000000000..a5329bc7704
--- /dev/null
+++ b/egs/yoloxochitl_mixtec/asr1/RESULTS.md
@@ -0,0 +1,36 @@
+# RESULTS (100 epochs using a single GPU)
+## Environments
+- date: `Thu Jun 25 23:13:00 EDT 2020`
+- python version: `3.7.3 (default, Mar 27 2019, 22:11:17) [GCC 7.3.0]`
+- espnet version: `espnet 0.5.2`
+- chainer version: `chainer 6.0.0`
+- pytorch version: `pytorch 1.1.0`
+
+## Pre-trained Model
+- Model files (archived to model.tar.gz by `$ pack_model.sh`)
+  - model link: https://drive.google.com/file/d/1daXJp3mpvOKYYuEcgNbIDRyp16Q0gjFg/view?usp=sharing
+  - training config file: `conf/train.yaml`
+  - decoding config file: `conf/decode.yaml`
+  - cmvn file: `data/train_mixtec_surface_reserve/cmvn.ark`
+  - e2e file: `exp/train_mixtec_surface_reserve_pytorch_mixtec_surface_reserve/results/model.last10.avg.best`
+  - e2e JSON file: `exp/train_mixtec_surface_reserve_pytorch_mixtec_surface_reserve/results/model.json`
+  - lm file: `exp/train_rnnlm_pytorch_mixtec_surface_reserve_unigram150/rnnlm.model.best`
+  - lm JSON file: `exp/train_rnnlm_pytorch_mixtec_surface_reserve_unigram150/model.json`
+  - dict file: `data/lang_char`
+
+
+## train_mixtec_surface_reserve_pytorch_mixtec_surface_reserve
+### CER
+
+|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err|
+|---|---|---|---|---|---|---|---|---|
+|decode_dev_mixtec_surface_reserve_decode_mixtec_surface_reserve|10218|687420|89.6|6.0|4.5|2.7|13.2|87.8|
+|decode_test_mixtec_surface_reserve_decode_mixtec_surface_reserve|10112|688918|89.7|5.9|4.4|2.7|13.0|87.9|
+
+### WER
+
+|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err|
+|---|---|---|---|---|---|---|---|---|
+|decode_dev_mixtec_surface_reserve_decode_mixtec_surface_reserve|10218|165748|80.3|15.6|4.1|3.2|22.9|87.8|
+|decode_test_mixtec_surface_reserve_decode_mixtec_surface_reserve|10112|166168|80.5|15.5|4.1|3.2|22.7|87.9|
+
diff --git a/egs/yoloxochitl_mixtec/asr1/cmd.sh b/egs/yoloxochitl_mixtec/asr1/cmd.sh
new file mode 100644
index 00000000000..4d70c9c7a79
--- /dev/null
+++ b/egs/yoloxochitl_mixtec/asr1/cmd.sh
@@ -0,0 +1,89 @@
+# ====== About run.pl, queue.pl, slurm.pl, and ssh.pl ======
+# Usage: <cmd>.pl [options] JOB=1:<nj> <log> <command...>
+# e.g.
+# run.pl --mem 4G JOB=1:10 echo.JOB.log echo JOB
+#
+# Options:
+#   --time