diff --git a/egs2/TEMPLATE/enh1/README.md b/egs2/TEMPLATE/enh1/README.md
index 2d7e7aa542b..1b9984979d4 100644
--- a/egs2/TEMPLATE/enh1/README.md
+++ b/egs2/TEMPLATE/enh1/README.md
@@ -40,6 +40,7 @@ Format scp files such as `wav.scp`. The scp files include:
   + `spk{}.scp`: wav file list of speech reference signals. {} can be 1, 2, ..., depending on the number of speakers in the input signal in `wav.scp`.
   + `noise{}.scp` (optional): wav file list of noise reference signals. {} can be 1, 2, ..., depending on the number of noise types in the input signal in `wav.scp`. The file(s) are required when `--use_noise_ref true` is specified. Also related to the variable `noise_type_num`.
   + `dereverb{}.scp` (optional): wav file list of dereverberation reference signals (for training a dereverberation model). This file is required when `--use_dereverb_ref true` is specified. Also related to the variable `dereverb_ref_num`.
+  + `utt2category` (optional): the category of each utterance. This file can help the batch sampler load utterances of the same category in each batch. One use case is loading simulated data and real data in different batches.
 
 #### Stage 4: Remove short data
 This stage is same as that in ASR recipe.
diff --git a/egs2/TEMPLATE/enh1/enh.sh b/egs2/TEMPLATE/enh1/enh.sh
index fcb4f324f15..e0985ce67a7 100755
--- a/egs2/TEMPLATE/enh1/enh.sh
+++ b/egs2/TEMPLATE/enh1/enh.sh
@@ -201,6 +201,9 @@ fi
 [ -z "${valid_set}" ] &&   { log "${help_message}"; log "Error: --valid_set is required"  ; exit 2; };
 [ -z "${test_sets}" ] && { log "${help_message}"; log "Error: --test_sets is required"; exit 2; };
 
+# Extra files for enhancement process
+utt_extra_files="utt2category"
+
 data_feats=${dumpdir}/raw
 
 
@@ -267,7 +270,7 @@ if ! "${skip_data_prep}"; then
 
            for factor in ${speed_perturb_factors}; do
                if [[ $(bc <<<"${factor} != 1.0") == 1 ]]; then
-                   scripts/utils/perturb_enh_data_dir_speed.sh "${factor}" "data/${train_set}" "data/${train_set}_sp${factor}" "${_scp_list}"
+                   scripts/utils/perturb_enh_data_dir_speed.sh --utt_extra_files "${utt_extra_files}" "${factor}" "data/${train_set}" "data/${train_set}_sp${factor}" "${_scp_list}"
                    _dirs+="data/${train_set}_sp${factor} "
                else
                    # If speed factor is 1, same as the original
diff --git a/egs2/TEMPLATE/enh1/scripts/utils/perturb_enh_data_dir_speed.sh b/egs2/TEMPLATE/enh1/scripts/utils/perturb_enh_data_dir_speed.sh
index 1d0a0fc3c3b..04887e10f30 100755
--- a/egs2/TEMPLATE/enh1/scripts/utils/perturb_enh_data_dir_speed.sh
+++ b/egs2/TEMPLATE/enh1/scripts/utils/perturb_enh_data_dir_speed.sh
@@ -27,6 +27,9 @@
 export LC_ALL=C
 set -euo pipefail
 
+utt_extra_files=
+. utils/parse_options.sh
+
 if [[ $# != 4 ]]; then
     echo "Usage: perturb_data_dir_speed.sh <warping-factor> <srcdir> <destdir> <scp_files>"
     echo "e.g.:"
@@ -108,17 +111,15 @@ for scp_file in ${scp_files};do
   fi
 done
 
-if [[ -f ${srcdir}/text ]]; then
-    utils/apply_map.pl -f 1 "${destdir}"/utt_map <"${srcdir}"/text >"${destdir}"/text
-fi
+for x in text utt2lang ${utt_extra_files}; do  # utt_extra_files is split on whitespace on purpose (list of file names)
+    if [[ -f "${srcdir}/${x}" ]]; then  # only remap per-utterance files that exist in the source dir
+        utils/apply_map.pl -f 1 "${destdir}"/utt_map <"${srcdir}/${x}" >"${destdir}/${x}"
+    fi
+done
 if [[ -f ${srcdir}/spk2gender ]]; then
     utils/apply_map.pl -f 1 "${destdir}"/spk_map <"${srcdir}"/spk2gender >"${destdir}"/spk2gender
 fi
-if [[ -f ${srcdir}/utt2lang ]]; then
-    utils/apply_map.pl -f 1 "${destdir}"/utt_map <"${srcdir}"/utt2lang >"${destdir}"/utt2lang
-fi
-
 rm "${destdir}"/spk_map "${destdir}"/utt_map "${destdir}"/reco_map 2>/dev/null
 echo "$0: generated speed-perturbed version of data in ${srcdir}, in ${destdir}"
-
+utils/fix_data_dir.sh "${destdir}"
 utils/validate_data_dir.sh --no-feats --no-text "${destdir}"
diff --git a/egs2/TEMPLATE/enh_asr1/cmd.sh b/egs2/TEMPLATE/enh_asr1/cmd.sh
new file mode 100644
index 00000000000..2aae6919fef
--- /dev/null
+++ b/egs2/TEMPLATE/enh_asr1/cmd.sh
@@ -0,0 +1,110 @@
+# ====== About run.pl, queue.pl, slurm.pl, and ssh.pl ======
+# Usage: <cmd>.pl [options] JOB=1:<nj> <log> <command...>
+# e.g.
+#   run.pl --mem 4G JOB=1:10 echo.JOB.log echo JOB
+#
+# Options:
+#   --time <time>: Limit the maximum time to execute.
+#   --mem <mem>: Limit the maximum memory usage.
+#   --max-jobs-run <njob>: Limit the number parallel jobs. This is ignored for non-array jobs.
+#   --num-threads <nthreads>: Specify the number of CPU cores.
+#   --gpu <ngpu>: Specify the number of GPU devices.
+#   --config: Change the configuration file from default.
+#
+# "JOB=1:10" is used for "array jobs" and it can control the number of parallel jobs.
+# The left string of "=", i.e. "JOB", is replaced by <N>(Nth job) in the command and the log file name,
+# e.g. "echo JOB" is changed to "echo 3" for the 3rd job and "echo 8" for 8th job respectively.
+# Note that the number must start with a positive number, so you can't use "JOB=0:10" for example.
+#
+# run.pl, queue.pl, slurm.pl, and ssh.pl have unified interface, not depending on its backend.
+# These options are mapping to specific options for each backend and
+# it is configured by "conf/queue.conf" and "conf/slurm.conf" by default.
+# If jobs failed, your configuration might be wrong for your environment.
+#
+#
+# The official documentation for run.pl, queue.pl, slurm.pl, and ssh.pl:
+#   "Parallelization in Kaldi": http://kaldi-asr.org/doc/queue.html
+# =========================================================~
+
+
+# Select the backend used by run.sh from "local", "stdout", "sge", "pbs", "slurm", "ssh", or "jhu"
+cmd_backend='local'
+
+# Local machine, without any Job scheduling system
+if [ "${cmd_backend}" = local ]; then
+
+    # The other usage
+    export train_cmd="run.pl"
+    # Used for "*_train.py": "--gpu" is appended optionally by run.sh
+    export cuda_cmd="run.pl"
+    # Used for "*_recog.py"
+    export decode_cmd="run.pl"
+
+# Local machine logging to stdout and log file, without any Job scheduling system
+elif [ "${cmd_backend}" = stdout ]; then
+
+    # The other usage
+    export train_cmd="stdout.pl"
+    # Used for "*_train.py": "--gpu" is appended optionally by run.sh
+    export cuda_cmd="stdout.pl"
+    # Used for "*_recog.py"
+    export decode_cmd="stdout.pl"
+
+
+# "qsub" (Sun Grid Engine, or derivation of it)
+elif [ "${cmd_backend}" = sge ]; then
+    # The default setting is written in conf/queue.conf.
+    # You must change "-q g.q" for the "queue" for your environment.
+    # To know the "queue" names, type "qhost -q"
+    # Note that to use "--gpu *", you have to setup "complex_value" for the system scheduler.
+
+    export train_cmd="queue.pl"
+    export cuda_cmd="queue.pl"
+    export decode_cmd="queue.pl"
+
+
+# "qsub" (Torque/PBS.)
+elif [ "${cmd_backend}" = pbs ]; then
+    # The default setting is written in conf/pbs.conf.
+
+    export train_cmd="pbs.pl"
+    export cuda_cmd="pbs.pl"
+    export decode_cmd="pbs.pl"
+
+
+# "sbatch" (Slurm)
+elif [ "${cmd_backend}" = slurm ]; then
+    # The default setting is written in conf/slurm.conf.
+    # You must change "-p cpu" and "-p gpu" for the "partition" for your environment.
+    # To know the "partition" names, type "sinfo".
+    # You can use "--gpu * " by default for slurm and it is interpreted as "--gres gpu:*"
+    # The devices are allocated exclusively using "${CUDA_VISIBLE_DEVICES}".
+
+    export train_cmd="slurm.pl"
+    export cuda_cmd="slurm.pl"
+    export decode_cmd="slurm.pl"
+# "ssh" (jobs are dispatched with ssh.pl to the hosts listed in .queue/machines)
+elif [ "${cmd_backend}" = ssh ]; then
+    # You have to create ".queue/machines" to specify the host to execute jobs.
+    # e.g. .queue/machines
+    #   host1
+    #   host2
+    #   host3
+    # Assuming you can login them without any password, i.e. You have to set ssh keys.
+
+    export train_cmd="ssh.pl"
+    export cuda_cmd="ssh.pl"
+    export decode_cmd="ssh.pl"
+
+# This is an example of specifying several unique options in the JHU CLSP cluster setup.
+# Users can modify/add their own command options according to their cluster environments.
+elif [ "${cmd_backend}" = jhu ]; then
+
+    export train_cmd="queue.pl --mem 2G"
+    export cuda_cmd="queue-freegpu.pl --mem 2G --gpu 1 --config conf/queue.conf"
+    export decode_cmd="queue.pl --mem 4G"
+
+else
+    echo "$0: Error: Unknown cmd_backend=${cmd_backend}" 1>&2
+    return 1  # "return", not "exit": this file is sourced (". ./cmd.sh") by the recipe scripts
+fi
diff --git a/egs2/TEMPLATE/enh_asr1/conf/fbank.conf b/egs2/TEMPLATE/enh_asr1/conf/fbank.conf
new file mode 100644
index 00000000000..82ac7bd0dbc
--- /dev/null
+++ b/egs2/TEMPLATE/enh_asr1/conf/fbank.conf
@@ -0,0 +1,2 @@
+--sample-frequency=16000 
+--num-mel-bins=80
diff --git a/egs2/TEMPLATE/enh_asr1/conf/pbs.conf b/egs2/TEMPLATE/enh_asr1/conf/pbs.conf
new file mode 100644
index 00000000000..119509938ce
--- /dev/null
+++ b/egs2/TEMPLATE/enh_asr1/conf/pbs.conf
@@ -0,0 +1,11 @@
+# Default configuration
+command qsub -V -v PATH -S /bin/bash
+option name=* -N $0
+option mem=* -l mem=$0
+option mem=0          # Do not add anything to qsub_opts
+option num_threads=* -l ncpus=$0
+option num_threads=1  # Do not add anything to qsub_opts
+option num_nodes=* -l nodes=$0:ppn=1
+default gpu=0
+option gpu=0
+option gpu=* -l ngpus=$0
diff --git a/egs2/TEMPLATE/enh_asr1/conf/pitch.conf b/egs2/TEMPLATE/enh_asr1/conf/pitch.conf
new file mode 100644
index 00000000000..e959a19d5b8
--- /dev/null
+++ b/egs2/TEMPLATE/enh_asr1/conf/pitch.conf
@@ -0,0 +1 @@
+--sample-frequency=16000
diff --git a/egs2/TEMPLATE/enh_asr1/conf/queue.conf b/egs2/TEMPLATE/enh_asr1/conf/queue.conf
new file mode 100644
index 00000000000..500582fab31
--- /dev/null
+++ b/egs2/TEMPLATE/enh_asr1/conf/queue.conf
@@ -0,0 +1,12 @@
+# Default configuration
+command qsub -v PATH -cwd -S /bin/bash -j y -l arch=*64*
+option name=* -N $0
+option mem=* -l mem_free=$0,ram_free=$0
+option mem=0          # Do not add anything to qsub_opts
+option num_threads=* -pe smp $0
+option num_threads=1  # Do not add anything to qsub_opts
+option max_jobs_run=* -tc $0
+option num_nodes=* -pe mpi $0  # You must set this PE as allocation_rule=1
+default gpu=0
+option gpu=0
+option gpu=* -l gpu=$0 -q g.q
diff --git a/egs2/TEMPLATE/enh_asr1/conf/slurm.conf b/egs2/TEMPLATE/enh_asr1/conf/slurm.conf
new file mode 100644
index 00000000000..3b229673638
--- /dev/null
+++ b/egs2/TEMPLATE/enh_asr1/conf/slurm.conf
@@ -0,0 +1,14 @@
+# Default configuration
+command sbatch --export=PATH
+option name=* --job-name $0
+option time=* --time $0
+option mem=* --mem-per-cpu $0
+option mem=0
+option num_threads=* --cpus-per-task $0
+option num_threads=1 --cpus-per-task 1
+option num_nodes=* --nodes $0
+default gpu=0
+option gpu=0 -p cpu
+option gpu=* -p gpu --gres=gpu:$0 -c $0  # Recommend allocating more CPU than, or equal to the number of GPU
+# note: the --max-jobs-run option is supported as a special case
+# by slurm.pl and you don't have to handle it in the config file.
diff --git a/egs2/TEMPLATE/enh_asr1/db.sh b/egs2/TEMPLATE/enh_asr1/db.sh
new file mode 120000
index 00000000000..318d781d123
--- /dev/null
+++ b/egs2/TEMPLATE/enh_asr1/db.sh
@@ -0,0 +1 @@
+../asr1/db.sh
\ No newline at end of file
diff --git a/egs2/TEMPLATE/enh_asr1/enh_asr.sh b/egs2/TEMPLATE/enh_asr1/enh_asr.sh
new file mode 100755
index 00000000000..3d238938a57
--- /dev/null
+++ b/egs2/TEMPLATE/enh_asr1/enh_asr.sh
@@ -0,0 +1,1640 @@
+#!/usr/bin/env bash
+
+# Set bash to strict mode, it will exit on :
+# -e 'error', -u 'undefined variable', -o pipefail 'error in pipeline' (-x 'print commands' is not enabled here),
+set -e
+set -u
+set -o pipefail
+
+log() {  # Echo "$*" prefixed with a timestamp and the caller's file:line:function.
+    local origin="${BASH_SOURCE[1]##*/}:${BASH_LINENO[0]}:${FUNCNAME[1]}"
+    echo -e "$(date '+%Y-%m-%dT%H:%M:%S') (${origin}) $*"
+}
+# Print the smallest of the integer arguments.
+min() {
+  local smallest candidate
+  smallest=$1
+  for candidate in "$@"; do
+      # Part of an "&&" list, so a false comparison does not trip "set -e".
+      [ "${candidate}" -le "${smallest}" ] && smallest="${candidate}"
+  done
+  echo "${smallest}"
+}
+SECONDS=0
+
+# General configuration
+stage=1              # Processes starts from the specified stage.
+stop_stage=10000     # Processes is stopped at the specified stage.
+skip_data_prep=false # Skip data preparation stages.
+skip_train=false     # Skip training stages.
+skip_eval=false      # Skip decoding and evaluation stages.
+skip_upload_hf=true  # Skip uploading to hugging face stages.
+ngpu=1               # The number of gpus ("0" uses cpu, otherwise use gpu).
+num_nodes=1          # The number of nodes.
+nj=32                # The number of parallel jobs.
+inference_nj=32      # The number of parallel jobs in decoding.
+gpu_inference=false  # Whether to perform gpu decoding.
+dumpdir=dump         # Directory to dump features.
+expdir=exp           # Directory to save experiments.
+python=python3       # Specify python to execute espnet commands.
+
+# Data preparation related
+local_data_opts= # The options given to local/data.sh.
+
+# Speed perturbation related
+speed_perturb_factors=  # perturbation factors, e.g. "0.9 1.0 1.1" (separated by space).
+
+# Feature extraction related
+feats_type=raw       # Feature type (raw, fbank_pitch, fbank or extracted).
+audio_format=flac    # Audio format: wav, flac, wav.ark, flac.ark  (only in feats_type=raw).
+fs=16k               # Sampling rate.
+min_wav_duration=0.1 # Minimum duration in second.
+max_wav_duration=20  # Maximum duration in second.
+
+# Tokenization related
+token_type=bpe      # Tokenization type (bpe, char or word).
+nbpe=30             # The number of BPE vocabulary.
+bpemode=unigram     # Mode of BPE (unigram or bpe).
+oov="<unk>"         # Out of vocabulary symbol.
+blank="<blank>"     # CTC blank symbol.
+sos_eos="<sos/eos>" # sos and eos symbol.
+bpe_input_sentence_size=100000000 # Size of input sentence for BPE.
+bpe_nlsyms=         # Non-linguistic symbols list, separated by a comma, for BPE.
+bpe_char_cover=1.0  # Character coverage when modeling BPE.
+
+# Ngram model related
+use_ngram=false
+ngram_exp=
+ngram_num=3
+
+# Language model related
+use_lm=true       # Use language model for ASR decoding.
+lm_tag=           # Suffix to the result dir for language model training.
+lm_exp=           # Specify the directory path for LM experiment.
+                  # If this option is specified, lm_tag is ignored.
+lm_stats_dir=     # Specify the directory path for LM statistics.
+lm_config=        # Config for language model training.
+lm_args=          # Arguments for language model training, e.g., "--max_epoch 10".
+                  # Note that it will overwrite args in lm config.
+use_word_lm=false # Whether to use word language model.
+num_splits_lm=1   # Number of splitting for lm corpus.
+# shellcheck disable=SC2034
+word_vocab_size=10000 # Size of word vocabulary.
+
+# ASR model related
+enh_asr_tag=       # Suffix to the result dir for asr model training.
+enh_asr_exp=       # Specify the directory path for ASR experiment.
+                   # If this option is specified, enh_asr_tag is ignored.
+enh_asr_stats_dir= # Specify the directory path for ASR statistics.
+enh_asr_config=    # Config for asr model training.
+enh_asr_args=      # Arguments for asr model training, e.g., "--max_epoch 10".
+                   # Note that it will overwrite args in asr config.
+pretrained_model=          # Pretrained model to load
+ignore_init_mismatch=false # Ignore initial mismatch
+feats_normalize=global_mvn # Normalization layer type.
+num_splits_asr=1           # Number of splitting for ASR corpus.
+
+# Upload model related
+hf_repo=
+
+# Decoding related
+use_k2=false      # Whether to use k2 based decoder
+batch_size=1
+inference_tag=    # Suffix to the result dir for decoding.
+inference_config= # Config for decoding.
+asr_inference_args= # Arguments for decoding, e.g., "--lm_weight 0.1".
+                    # Note that it will overwrite args in inference config.
+enh_inference_args="--normalize_output_wav true"
+inference_lm=valid.loss.ave.pth       # Language model path for decoding.
+inference_ngram=${ngram_num}gram.bin
+inference_enh_asr_model=valid.acc.ave.pth # ASR model path for decoding.
+                                          # e.g.
+                                          # inference_enh_asr_model=train.loss.best.pth
+                                          # inference_enh_asr_model=3epoch.pth
+                                          # inference_enh_asr_model=valid.acc.best.pth
+                                          # inference_enh_asr_model=valid.loss.ave.pth
+download_model= # Download a model from Model Zoo and use it for decoding.
+
+# Enhancement related arguments
+spk_num=1   # Number of speakers
+noise_type_num=1
+dereverb_ref_num=1
+# Evaluation related
+scoring_protocol="STOI SDR SAR SIR SI_SNR"
+ref_channel=0
+
+# Enh Training data related
+use_dereverb_ref=false
+use_noise_ref=false
+
+# [Task dependent] Set the datadir name created by local/data.sh
+train_set=       # Name of training set.
+valid_set=       # Name of validation set used for monitoring/tuning network training.
+test_sets=       # Names of test sets. Multiple items (e.g., both dev and eval sets) can be specified.
+bpe_train_text=  # Text file path of bpe training set.
+lm_train_text=   # Text file path of language model training set.
+lm_dev_text=     # Text file path of language model development set.
+lm_test_text=    # Text file path of language model evaluation set.
+nlsyms_txt=none  # Non-linguistic symbol list if existing.
+cleaner=none     # Text cleaner.
+g2p=none         # g2p method (needed if token_type=phn).
+lang=noinfo      # The language type of corpus.
+score_opts=                # The options given to sclite scoring
+local_score_opts=          # The options given to local/score.sh.
+enh_asr_speech_fold_length=800 # fold_length for speech data during ASR training.
+enh_asr_text_fold_length=150   # fold_length for text data during ASR training.
+lm_fold_length=150         # fold_length for LM training.
+
+help_message=$(cat << EOF
+Usage: $0 --train-set "<train_set_name>" --valid-set "<valid_set_name>" --test_sets "<test_set_names>"
+
+Options:
+    # General configuration
+    --stage          # Processes starts from the specified stage (default="${stage}").
+    --stop_stage     # Processes is stopped at the specified stage (default="${stop_stage}").
+    --skip_data_prep # Skip data preparation stages (default="${skip_data_prep}").
+    --skip_train     # Skip training stages (default="${skip_train}").
+    --skip_eval      # Skip decoding and evaluation stages (default="${skip_eval}").
+    --skip_upload_hf    # Skip packing and uploading stages (default="${skip_upload_hf}").
+    --ngpu           # The number of gpus ("0" uses cpu, otherwise use gpu, default="${ngpu}").
+    --num_nodes      # The number of nodes (default="${num_nodes}").
+    --nj             # The number of parallel jobs (default="${nj}").
+    --inference_nj   # The number of parallel jobs in decoding (default="${inference_nj}").
+    --gpu_inference  # Whether to perform gpu decoding (default="${gpu_inference}").
+    --dumpdir        # Directory to dump features (default="${dumpdir}").
+    --expdir         # Directory to save experiments (default="${expdir}").
+    --python         # Specify python to execute espnet commands (default="${python}").
+
+    # Data preparation related
+    --local_data_opts # The options given to local/data.sh (default="${local_data_opts}").
+
+    # Speed perturbation related
+    --speed_perturb_factors # speed perturbation factors, e.g. "0.9 1.0 1.1" (separated by space, default="${speed_perturb_factors}").
+
+    # Feature extraction related
+    --feats_type       # Feature type (raw, fbank_pitch, fbank or extracted, default="${feats_type}").
+    --audio_format     # Audio format: wav, flac, wav.ark, flac.ark  (only in feats_type=raw, default="${audio_format}").
+    --fs               # Sampling rate (default="${fs}").
+    --min_wav_duration # Minimum duration in second (default="${min_wav_duration}").
+    --max_wav_duration # Maximum duration in second (default="${max_wav_duration}").
+
+    # Tokenization related
+    --token_type              # Tokenization type (bpe, char or word, default="${token_type}").
+    --nbpe                    # The number of BPE vocabulary (default="${nbpe}").
+    --bpemode                 # Mode of BPE (unigram or bpe, default="${bpemode}").
+    --oov                     # Out of vocabulary symbol (default="${oov}").
+    --blank                   # CTC blank symbol (default="${blank}").
+    --sos_eos                 # sos and eos symbol (default="${sos_eos}").
+    --bpe_input_sentence_size # Size of input sentence for BPE (default="${bpe_input_sentence_size}").
+    --bpe_nlsyms              # Non-linguistic symbol list for sentencepiece, separated by a comma. (default="${bpe_nlsyms}").
+    --bpe_char_cover          # Character coverage when modeling BPE (default="${bpe_char_cover}").
+
+    # Language model related
+    --lm_tag          # Suffix to the result dir for language model training (default="${lm_tag}").
+    --lm_exp          # Specify the directory path for LM experiment.
+                      # If this option is specified, lm_tag is ignored (default="${lm_exp}").
+    --lm_stats_dir    # Specify the directory path for LM statistics (default="${lm_stats_dir}").
+    --lm_config       # Config for language model training (default="${lm_config}").
+    --lm_args         # Arguments for language model training (default="${lm_args}").
+                      # e.g., --lm_args "--max_epoch 10"
+                      # Note that it will overwrite args in lm config.
+    --use_word_lm     # Whether to use word language model (default="${use_word_lm}").
+    --word_vocab_size # Size of word vocabulary (default="${word_vocab_size}").
+    --num_splits_lm   # Number of splitting for lm corpus (default="${num_splits_lm}").
+
+    # ASR model related
+    --enh_asr_tag          # Suffix to the result dir for asr model training (default="${enh_asr_tag}").
+    --enh_asr_exp          # Specify the directory path for ASR experiment.
+                       # If this option is specified, enh_asr_tag is ignored (default="${enh_asr_exp}").
+    --enh_asr_stats_dir    # Specify the directory path for ASR statistics (default="${enh_asr_stats_dir}").
+    --enh_asr_config       # Config for asr model training (default="${enh_asr_config}").
+    --enh_asr_args         # Arguments for asr model training (default="${enh_asr_args}").
+                           # e.g., --enh_asr_args "--max_epoch 10"
+                           # Note that it will overwrite args in asr config.
+    --pretrained_model           # Pretrained model to load (default="${pretrained_model}").
+    --ignore_init_mismatch       # Ignore mismatch parameter init with pretrained model (default="${ignore_init_mismatch}").
+    --feats_normalize  # Normalization layer type (default="${feats_normalize}").
+    --num_splits_asr   # Number of splitting for ASR corpus  (default="${num_splits_asr}").
+
+    # Decoding related
+    --inference_tag       # Suffix to the result dir for decoding (default="${inference_tag}").
+    --inference_config    # Config for decoding (default="${inference_config}").
+    --asr_inference_args      # Arguments for decoding (default="${asr_inference_args}").
+                              # e.g., --asr_inference_args "--lm_weight 0.1"
+                              # Note that it will overwrite args in inference config.
+    --enh_inference_args      # Arguments for enhancement (default="${enh_inference_args}").
+    --inference_lm        # Language model path for decoding (default="${inference_lm}").
+    --inference_enh_asr_model # ASR model path for decoding (default="${inference_enh_asr_model}").
+    --download_model      # Download a model from Model Zoo and use it for decoding (default="${download_model}").
+
+    --spk_num             # Number of speakers (default="${spk_num}")
+    --noise_type_num   # Number of noise types in the input audio (default="${noise_type_num}")
+    --dereverb_ref_num # Number of references for dereverberation (default="${dereverb_ref_num}")
+    --use_dereverb_ref # Whether or not to use dereverberated signal as an additional reference
+                         for training a dereverberation model (default="${use_dereverb_ref}")
+    --use_noise_ref    # Whether or not to use noise signal as an additional reference
+                         for training a denoising model (default="${use_noise_ref}")
+    # Enhancement Evaluation related
+    --scoring_protocol    # Metrics to be used for scoring (default="${scoring_protocol}")
+    --ref_channel         # Reference channel of the reference speech will be used if the model
+                            output is single-channel and reference speech is multi-channel
+                            (default="${ref_channel}")
+    # [Task dependent] Set the datadir name created by local/data.sh
+    --train_set     # Name of training set (required).
+    --valid_set     # Name of validation set used for monitoring/tuning network training (required).
+    --test_sets     # Names of test sets.
+                    # Multiple items (e.g., both dev and eval sets) can be specified (required).
+    --bpe_train_text # Text file path of bpe training set.
+    --lm_train_text  # Text file path of language model training set.
+    --lm_dev_text   # Text file path of language model development set (default="${lm_dev_text}").
+    --lm_test_text  # Text file path of language model evaluation set (default="${lm_test_text}").
+    --nlsyms_txt    # Non-linguistic symbol list if existing (default="${nlsyms_txt}").
+    --cleaner       # Text cleaner (default="${cleaner}").
+    --g2p           # g2p method (default="${g2p}").
+    --lang          # The language type of corpus (default="${lang}").
+    --score_opts             # The options given to sclite scoring (default="${score_opts}").
+    --local_score_opts       # The options given to local/score.sh (default="${local_score_opts}").
+    --enh_asr_speech_fold_length # fold_length for speech data during ASR training (default="${enh_asr_speech_fold_length}").
+    --enh_asr_text_fold_length   # fold_length for text data during ASR training (default="${enh_asr_text_fold_length}").
+    --lm_fold_length         # fold_length for LM training (default="${lm_fold_length}").
+EOF
+)
+
+log "$0 $*"
+# Save command line args for logging (they will be lost after utils/parse_options.sh)
+run_args=$(pyscripts/utils/print_args.py $0 "$@")
+. utils/parse_options.sh
+
+if [ $# -ne 0 ]; then
+    log "${help_message}"
+    log "Error: No positional arguments are required."
+    exit 2
+fi
+
+. ./path.sh
+. ./cmd.sh
+
+
+# Check required arguments
+[ -z "${train_set}" ] && { log "${help_message}"; log "Error: --train_set is required"; exit 2; };
+[ -z "${valid_set}" ] && { log "${help_message}"; log "Error: --valid_set is required"; exit 2; };
+[ -z "${test_sets}" ] && { log "${help_message}"; log "Error: --test_sets is required"; exit 2; };
+
+[ ${spk_num} -gt 1 ] && { log "${help_message}"; log "Error: --spk_num only 1 is supported"; exit 2; };
+
+# Check feature type
+if [ "${feats_type}" = raw ]; then
+    data_feats=${dumpdir}/raw
+elif [ "${feats_type}" = fbank_pitch ]; then
+    data_feats=${dumpdir}/fbank_pitch
+elif [ "${feats_type}" = fbank ]; then
+    data_feats=${dumpdir}/fbank
+elif [ "${feats_type}" == extracted ]; then
+    data_feats=${dumpdir}/extracted
+else
+    log "${help_message}"
+    log "Error: not supported: --feats_type ${feats_type}"
+    exit 2
+fi
+
+# Extra files for enhancement process
+utt_extra_files="utt2category text utt2lang"
+
+# Use the same text as ASR for bpe training if not specified.
+[ -z "${bpe_train_text}" ] && bpe_train_text="${data_feats}/${train_set}/text"
+# Use the same text as ASR for lm training if not specified.
+[ -z "${lm_train_text}" ] && lm_train_text="${data_feats}/${train_set}/text"
+# Use the text of the validation set for lm validation if not specified.
+[ -z "${lm_dev_text}" ] && lm_dev_text="${data_feats}/${valid_set}/text"
+# Use the text of the 1st evaldir if lm_test is not specified
+[ -z "${lm_test_text}" ] && lm_test_text="${data_feats}/${test_sets%% *}/text"
+
+# Check tokenization type
+if [ "${lang}" != noinfo ]; then
+    token_listdir=data/${lang}_token_list
+else
+    token_listdir=data/token_list
+fi
+bpedir="${token_listdir}/bpe_${bpemode}${nbpe}"
+bpeprefix="${bpedir}"/bpe
+bpemodel="${bpeprefix}".model
+bpetoken_list="${bpedir}"/tokens.txt
+chartoken_list="${token_listdir}"/char/tokens.txt
+# NOTE: keep for future development.
+# shellcheck disable=SC2034
+wordtoken_list="${token_listdir}"/word/tokens.txt
+
+if [ "${token_type}" = bpe ]; then
+    token_list="${bpetoken_list}"
+elif [ "${token_type}" = char ]; then
+    token_list="${chartoken_list}"
+    bpemodel=none
+elif [ "${token_type}" = word ]; then
+    token_list="${wordtoken_list}"
+    bpemodel=none
+else
+    log "Error: not supported --token_type '${token_type}'"
+    exit 2
+fi
+if ${use_word_lm}; then
+    log "Error: Word LM is not supported yet"
+    exit 2
+
+    lm_token_list="${wordtoken_list}"
+    lm_token_type=word
+else
+    lm_token_list="${token_list}"
+    lm_token_type="${token_type}"
+fi
+
+
+# Set tag for naming of model directory
+if [ -z "${enh_asr_tag}" ]; then
+    if [ -n "${enh_asr_config}" ]; then
+        enh_asr_tag="$(basename "${enh_asr_config}" .yaml)_${feats_type}"
+    else
+        enh_asr_tag="train_${feats_type}"
+    fi
+    if [ "${lang}" != noinfo ]; then
+        enh_asr_tag+="_${lang}_${token_type}"
+    else
+        enh_asr_tag+="_${token_type}"
+    fi
+    if [ "${token_type}" = bpe ]; then
+        enh_asr_tag+="${nbpe}"
+    fi
+    # Add overwritten arg's info
+    if [ -n "${enh_asr_args}" ]; then
+        enh_asr_tag+="$(echo "${enh_asr_args}" | sed -e "s/--/\_/g" -e "s/[ |=/]//g")"
+    fi
+    if [ -n "${speed_perturb_factors}" ]; then
+        enh_asr_tag+="_sp"
+    fi
+fi
+if [ -z "${lm_tag}" ]; then
+    if [ -n "${lm_config}" ]; then
+        lm_tag="$(basename "${lm_config}" .yaml)"
+    else
+        lm_tag="train"
+    fi
+    if [ "${lang}" != noinfo ]; then
+        lm_tag+="_${lang}_${lm_token_type}"
+    else
+        lm_tag+="_${lm_token_type}"
+    fi
+    if [ "${lm_token_type}" = bpe ]; then
+        lm_tag+="${nbpe}"
+    fi
+    # Add overwritten arg's info
+    if [ -n "${lm_args}" ]; then
+        lm_tag+="$(echo "${lm_args}" | sed -e "s/--/\_/g" -e "s/[ |=/]//g")"
+    fi
+fi
+
+# The directory used for collect-stats mode
+if [ -z "${enh_asr_stats_dir}" ]; then
+    if [ "${lang}" != noinfo ]; then
+        enh_asr_stats_dir="${expdir}/enh_asr_stats_${feats_type}_${lang}_${token_type}"
+    else
+        enh_asr_stats_dir="${expdir}/enh_asr_stats_${feats_type}_${token_type}"
+    fi
+    if [ "${token_type}" = bpe ]; then
+        enh_asr_stats_dir+="${nbpe}"
+    fi
+    if [ -n "${speed_perturb_factors}" ]; then
+        enh_asr_stats_dir+="_sp"
+    fi
+fi
+if [ -z "${lm_stats_dir}" ]; then
+    if [ "${lang}" != noinfo ]; then
+        lm_stats_dir="${expdir}/lm_stats_${lang}_${lm_token_type}"
+    else
+        lm_stats_dir="${expdir}/lm_stats_${lm_token_type}"
+    fi
+    if [ "${lm_token_type}" = bpe ]; then
+        lm_stats_dir+="${nbpe}"
+    fi
+fi
+# The directories used for training commands.
+# A path given on the command line is kept as is; an empty value falls
+# back to a directory under ${expdir}:
+#   enh_asr_exp -> ${expdir}/enh_asr_${enh_asr_tag}
+#   lm_exp      -> ${expdir}/lm_${lm_tag}
+#   ngram_exp   -> ${expdir}/ngram
+# ${var:-default} substitutes the default when var is empty or unset.
+enh_asr_exp="${enh_asr_exp:-${expdir}/enh_asr_${enh_asr_tag}}"
+lm_exp="${lm_exp:-${expdir}/lm_${lm_tag}}"
+ngram_exp="${ngram_exp:-${expdir}/ngram}"
+
+
+if [ -z "${inference_tag}" ]; then
+    if [ -n "${inference_config}" ]; then
+        inference_tag="$(basename "${inference_config}" .yaml)"
+    else
+        inference_tag=inference
+    fi
+    # Add overwritten arg's info
+    if [ -n "${asr_inference_args}" ]; then
+        inference_tag+="$(echo "${asr_inference_args}" | sed -e "s/--/\_/g" -e "s/[ |=]//g")"
+    fi
+    if [ -n "${enh_inference_args}" ]; then
+        inference_tag+="$(echo "${enh_inference_args}" | sed -e "s/--/\_/g" -e "s/[ |=]//g")"
+    fi
+    if "${use_lm}"; then
+        inference_tag+="_lm_$(basename "${lm_exp}")_$(echo "${inference_lm}" | sed -e "s/\//_/g" -e "s/\.[^.]*$//g")"
+    fi
+    if "${use_ngram}"; then
+        inference_tag+="_ngram_$(basename "${ngram_exp}")_$(echo "${inference_ngram}" | sed -e "s/\//_/g" -e "s/\.[^.]*$//g")"
+    fi
+    inference_tag+="_enh_asr_model_$(echo "${inference_enh_asr_model}" | sed -e "s/\//_/g" -e "s/\.[^.]*$//g")"
+
+    if "${use_k2}"; then
+      inference_tag+="_use_k2"
+    fi
+fi
+
+# ========================== Main stages start from here. ==========================
+
+if ! "${skip_data_prep}"; then
+    # Stage 1: corpus-specific data preparation, delegated to local/data.sh.
+    if [ "${stage}" -le 1 ] && [ "${stop_stage}" -ge 1 ]; then
+        log "Stage 1: Data preparation for data/${train_set}, data/${valid_set}, etc."
+        # [Task dependent] Every new corpus has to provide its own local/data.sh.
+        # ${local_data_opts} is left unquoted on purpose so that it splits into options.
+        # shellcheck disable=SC2086
+        local/data.sh ${local_data_opts}
+    fi
+
+    # Stage 2 (optional): build a speed-perturbed training set. Every factor other
+    # than 1.0 produces data/${train_set}_sp${factor}; all resulting directories
+    # are then combined into data/${train_set}_sp.
+    if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+        if [ -n "${speed_perturb_factors}" ]; then
+            log "Stage 2: Speed perturbation: data/${train_set} -> data/${train_set}_sp"
+
+            # scp files that are perturbed and combined: the mixture plus one
+            # reference per speaker.
+            _scp_list="wav.scp "
+            for i in $(seq ${spk_num}); do
+                _scp_list+="spk${i}.scp "
+            done
+
+            # Start from an empty list so that a stale or inherited value of
+            # _dirs cannot leak into the combine step.
+            _dirs=
+            for factor in ${speed_perturb_factors}; do
+                if [[ $(bc <<<"${factor} != 1.0") == 1 ]]; then
+                    scripts/utils/perturb_enh_data_dir_speed.sh --utt_extra_files "${utt_extra_files}" "${factor}" "data/${train_set}" "data/${train_set}_sp${factor}" "${_scp_list}"
+                    _dirs+="data/${train_set}_sp${factor} "
+                else
+                    # If speed factor is 1, same as the original
+                    _dirs+="data/${train_set} "
+                fi
+            done
+
+            # The utterance-level extra files are handed to the combine step too,
+            # because the loop below expects them in data/${train_set}_sp.
+            # NOTE(review): assumes combine_data.sh only merges non-standard files
+            # that are listed in --extra-files -- confirm against utils/combine_data.sh.
+            # shellcheck disable=SC2086
+            utils/combine_data.sh --extra-files "${_scp_list}${utt_extra_files}" "data/${train_set}_sp" ${_dirs}
+            for extra_file in ${utt_extra_files}; do
+                # Extra files are optional: nothing to de-duplicate if absent.
+                if [ ! -f "data/${train_set}_sp/${extra_file}" ]; then
+                    continue
+                fi
+                python pyscripts/utils/remove_duplicate_keys.py "data/${train_set}_sp/${extra_file}" > "data/${train_set}_sp/${extra_file}.tmp"
+                mv "data/${train_set}_sp/${extra_file}.tmp" "data/${train_set}_sp/${extra_file}"
+            done
+        else
+            log "Skip stage 2: Speed perturbation"
+        fi
+    fi
+
+    # With speed perturbation configured, later stages use data/${train_set}_sp.
+    if [ -n "${speed_perturb_factors}" ]; then
+        train_set+="_sp"
+    fi
+
+    # Stage 3: re-create the audio lists of every data set under ${data_feats}.
+    # Train/valid sets are written to ${data_feats}/org/<set>, test sets to
+    # ${data_feats}/<set>. Only --feats_type raw is supported.
+    if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
+        if [ "${feats_type}" = raw ]; then
+            log "Stage 3: Format wav.scp: data/ -> ${data_feats}"
+
+            # ====== Recreating "wav.scp" ======
+            # Kaldi-wav.scp, which can describe the file path with unix-pipe, like "cat /some/path |",
+            # shouldn't be used in training process.
+            # "format_wav_scp.sh" dumps such pipe-style-wav to real audio file
+            # and it can also change the audio-format and sampling rate.
+            # If nothing is needed, then format_wav_scp.sh does nothing:
+            # i.e. the input file format and rate is same as the output.
+
+            for dset in "${train_set}" "${valid_set}" ${test_sets}; do
+                if [ "${dset}" = "${train_set}" ] || [ "${dset}" = "${valid_set}" ]; then
+                    _suf="/org"
+                else
+                    _suf=""
+                fi
+                utils/copy_data_dir.sh --validate_opts --non-print data/"${dset}" "${data_feats}${_suf}/${dset}"
+
+                # Copy the utterance-level extra files. A trailing glob is used so
+                # that multi-reference variants of a file are picked up as well.
+                expand_utt_extra_files=""
+                for extra_file in ${utt_extra_files}; do
+                    for single_file in "data/${dset}/${extra_file}"*; do
+                        # An unmatched glob stays literal; the extra files are
+                        # optional, so a missing one is simply skipped.
+                        [ -e "${single_file}" ] || continue
+                        cp "${single_file}" "${data_feats}${_suf}/${dset}"
+                        expand_utt_extra_files="${expand_utt_extra_files} $(basename "${single_file}")"
+                    done
+                done
+                echo "${expand_utt_extra_files}"
+
+                # Drop the copied recording-level files; wav.scp is written again
+                # by format_wav_scp.sh below.
+                rm -f "${data_feats}${_suf}/${dset}"/{segments,wav.scp,reco2file_and_channel,reco2dur}
+                _opts=
+                if [ -e data/"${dset}"/segments ]; then
+                    # "segments" is used for splitting wav files which are written in "wav".scp
+                    # into utterances. The file format of segments:
+                    #   <segment_id> <record_id> <start_time> <end_time>
+                    #   "e.g. call-861225-A-0050-0065 call-861225-A 5.0 6.5"
+                    # Where the time is written in seconds.
+                    _opts+="--segments data/${dset}/segments "
+                fi
+
+                # Names (without ".scp") of all reference signals to be formatted
+                _spk_list=" "
+                for i in $(seq ${spk_num}); do
+                    _spk_list+="spk${i} "
+                done
+                # Noise / dereverberation references are only formatted for the
+                # train and valid sets (non-empty ${_suf}).
+                if $use_noise_ref && [ -n "${_suf}" ]; then
+                    # references for denoising ("noise1 noise2 ... noise${noise_type_num} ")
+                    _spk_list+=$(for n in $(seq $noise_type_num); do echo -n "noise$n "; done)
+                fi
+                if $use_dereverb_ref && [ -n "${_suf}" ]; then
+                    # references for dereverberation
+                    _spk_list+=$(for n in $(seq $dereverb_ref_num); do echo -n "dereverb$n "; done)
+                fi
+
+                # The mixture ("wav") is processed last, after all references.
+                for spk in ${_spk_list} "wav" ; do
+                    # shellcheck disable=SC2086
+                    scripts/audio/format_wav_scp.sh --nj "${nj}" --cmd "${train_cmd}" \
+                        --out-filename "${spk}.scp" \
+                        --ref_channels "0" \
+                        --audio-format "${audio_format}" --fs "${fs}" ${_opts} \
+                        "data/${dset}/${spk}.scp" "${data_feats}${_suf}/${dset}" \
+                        "${data_feats}${_suf}/${dset}/logs/${spk}" "${data_feats}${_suf}/${dset}/data/${spk}"
+
+                done
+
+                echo "${feats_type}" > "${data_feats}${_suf}/${dset}/feats_type"
+
+            done
+
+        else
+            log "Error: not supported: --feats_type ${feats_type}"
+            exit 2
+        fi
+    fi
+
+
+    # Stage 4: filter the train/valid sets from ${data_feats}/org into
+    # ${data_feats}: keep only utterances whose length lies strictly between
+    # min_wav_duration and max_wav_duration, drop utterances with empty text, and
+    # write the LM training text without empty lines to ${data_feats}/lm_train.txt.
+    if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
+        log "Stage 4: Remove long/short data: ${data_feats}/org -> ${data_feats}"
+
+        # NOTE(kamo): Not applying to test_sets to keep original data
+        for dset in "${train_set}" "${valid_set}"; do
+
+            # _spk_list holds the reference names, _scp_list the matching scp file names
+            _spk_list=" "
+            _scp_list=" "
+            for i in $(seq ${spk_num}); do
+                _spk_list+="spk${i} "
+                _scp_list+="spk${i}.scp "
+            done
+            if $use_noise_ref; then
+                # references for denoising ("noise1 noise2 ... noise${noise_type_num} ")
+                _spk_list+=$(for n in $(seq $noise_type_num); do echo -n "noise$n "; done)
+                _scp_list+=$(for n in $(seq $noise_type_num); do echo -n "noise$n.scp "; done)
+            fi
+            if $use_dereverb_ref; then
+                # references for dereverberation
+                _spk_list+=$(for n in $(seq $dereverb_ref_num); do echo -n "dereverb$n "; done)
+                _scp_list+=$(for n in $(seq $dereverb_ref_num); do echo -n "dereverb$n.scp "; done)
+            fi
+
+            # Copy data dir
+            utils/copy_data_dir.sh --validate_opts --non-print "${data_feats}/org/${dset}" "${data_feats}/${dset}"
+            cp "${data_feats}/org/${dset}/feats_type" "${data_feats}/${dset}/feats_type"
+
+            # Utterance-level extra files are optional, hence the existence check
+            for utt_extra_file in ${utt_extra_files}; do
+                if [ -f "${data_feats}/org/${dset}/${utt_extra_file}" ]; then
+                    cp "${data_feats}/org/${dset}/${utt_extra_file}" "${data_feats}/${dset}"
+                fi
+            done
+
+            # Remove short utterances
+            _feats_type="$(<${data_feats}/${dset}/feats_type)"
+            if [ "${_feats_type}" = raw ]; then
+                # Convert the duration limits (seconds) into numbers of samples;
+                # ${fs} is parsed with humanfriendly, so values such as "16k" work.
+                _fs=$(python3 -c "import humanfriendly as h;print(h.parse_size('${fs}'))")
+                _min_length=$(python3 -c "print(int(${min_wav_duration} * ${_fs}))")
+                _max_length=$(python3 -c "print(int(${max_wav_duration} * ${_fs}))")
+
+                # utt2num_samples is created by format_wav_scp.sh
+                <"${data_feats}/org/${dset}/utt2num_samples" \
+                    awk -v min_length="${_min_length}" -v max_length="${_max_length}" \
+                        '{ if ($2 > min_length && $2 < max_length ) print $0; }' \
+                        >"${data_feats}/${dset}/utt2num_samples"
+                # Restrict the mixture and every reference list to the surviving utterances
+                for spk in ${_spk_list} "wav"; do
+                    <"${data_feats}/org/${dset}/${spk}.scp" \
+                        utils/filter_scp.pl "${data_feats}/${dset}/utt2num_samples"  \
+                        >"${data_feats}/${dset}/${spk}.scp"
+                done
+            else
+                # Get frame shift in ms from conf/fbank.conf
+                _frame_shift=
+                if [ -f conf/fbank.conf ] && [ "$(<conf/fbank.conf grep -c frame-shift)" -gt 0 ]; then
+                    # Assume using conf/fbank.conf for feature extraction
+                    _frame_shift="$(<conf/fbank.conf grep frame-shift | sed -e 's/[-a-z =]*\([0-9]*\)/\1/g')"
+                fi
+                if [ -z "${_frame_shift}" ]; then
+                    # If not existing, use the default number in Kaldi (=10ms).
+                    # If you are using different number, you have to change the following value manually.
+                    _frame_shift=10
+                fi
+
+                # Convert the duration limits (seconds) into numbers of frames
+                _min_length=$(python3 -c "print(int(${min_wav_duration} / ${_frame_shift} * 1000))")
+                _max_length=$(python3 -c "print(int(${max_wav_duration} / ${_frame_shift} * 1000))")
+
+                cp "${data_feats}/org/${dset}/feats_dim" "${data_feats}/${dset}/feats_dim"
+                # Keep only the part before the first comma of every feats_shape
+                # line, then filter on its second whitespace-separated column.
+                <"${data_feats}/org/${dset}/feats_shape" awk -F, ' { print $1 } ' \
+                    | awk -v min_length="${_min_length}" -v max_length="${_max_length}" \
+                        '{ if ($2 > min_length && $2 < max_length) print $0; }' \
+                        >"${data_feats}/${dset}/feats_shape"
+                <"${data_feats}/org/${dset}/feats.scp" \
+                    utils/filter_scp.pl "${data_feats}/${dset}/feats_shape"  \
+                    >"${data_feats}/${dset}/feats.scp"
+            fi
+
+            # Remove empty text
+            # (a line with a single field carries only the utterance id)
+            <"${data_feats}/org/${dset}/text" \
+                awk ' { if( NF != 1 ) print $0; } ' >"${data_feats}/${dset}/text"
+
+            # fix_data_dir.sh leaves only utts which exist in all files
+            utils/fix_data_dir.sh --utt_extra_files "${utt_extra_files}" "${data_feats}/${dset}"
+        done
+
+        # ${lm_train_text} is expanded unquoted, so it may name several files.
+        # Lines that consist of a single field are dropped here as well.
+        # shellcheck disable=SC2002
+        cat ${lm_train_text} | awk ' { if( NF != 1 ) print $0; } ' > "${data_feats}/lm_train.txt"
+    fi
+
+
+    # Stage 5: generate the token list. For bpe a sentencepiece model is trained
+    # with spm_train and the list is derived from its vocabulary; for char/word
+    # the list is produced by espnet2.bin.tokenize_text. Optionally a word-level
+    # list for a word LM is generated in addition.
+    if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
+        if [ "${token_type}" = bpe ]; then
+            log "Stage 5: Generate token_list from ${bpe_train_text} using BPE"
+
+            mkdir -p "${bpedir}"
+            # Strip the first space-separated field of every line and keep the rest
+            # shellcheck disable=SC2002
+            cat ${bpe_train_text} | cut -f 2- -d" "  > "${bpedir}"/train.txt
+
+            if [ -n "${bpe_nlsyms}" ]; then
+                _opts_spm="--user_defined_symbols=${bpe_nlsyms}"
+            else
+                _opts_spm=""
+            fi
+
+            spm_train \
+                --input="${bpedir}"/train.txt \
+                --vocab_size="${nbpe}" \
+                --model_type="${bpemode}" \
+                --model_prefix="${bpeprefix}" \
+                --character_coverage=${bpe_char_cover} \
+                --input_sentence_size="${bpe_input_sentence_size}" \
+                ${_opts_spm}
+
+            # Assemble the token list: ${blank} and ${oov} first, then the
+            # vocabulary entries from the 4th line on, and ${sos_eos} last.
+            {
+            echo "${blank}"
+            echo "${oov}"
+            # Remove <unk>, <s>, </s> from the vocabulary
+            <"${bpeprefix}".vocab awk '{ if( NR != 1 && NR != 2 && NR != 3 ){ print $1; } }'
+            echo "${sos_eos}"
+            } > "${token_list}"
+
+        elif [ "${token_type}" = char ] || [ "${token_type}" = word ]; then
+            # NOTE(review): this branch also serves token_type=word although the
+            # message below says "character level".
+            log "Stage 5: Generate character level token_list from ${lm_train_text}"
+
+            _opts="--non_linguistic_symbols ${nlsyms_txt}"
+
+            # The first symbol in token_list must be "<blank>" and the last must be also sos/eos:
+            # 0 is reserved for CTC-blank for ASR and also used as ignore-index in the other task
+            ${python} -m espnet2.bin.tokenize_text  \
+                --token_type "${token_type}" \
+                --input "${data_feats}/lm_train.txt" --output "${token_list}" ${_opts} \
+                --field 2- \
+                --cleaner "${cleaner}" \
+                --g2p "${g2p}" \
+                --write_vocabulary true \
+                --add_symbol "${blank}:0" \
+                --add_symbol "${oov}:1" \
+                --add_symbol "${sos_eos}:-1"
+
+        else
+            log "Error: not supported --token_type '${token_type}'"
+            exit 2
+        fi
+
+        # Create word-list for word-LM training
+        # (skipped when the main token list is already word level)
+        if ${use_word_lm} && [ "${token_type}" != word ]; then
+            log "Generate word level token_list from ${data_feats}/lm_train.txt"
+            ${python} -m espnet2.bin.tokenize_text \
+                --token_type word \
+                --input "${data_feats}/lm_train.txt" --output "${lm_token_list}" \
+                --field 2- \
+                --cleaner "${cleaner}" \
+                --g2p "${g2p}" \
+                --write_vocabulary true \
+                --vocabulary_size "${word_vocab_size}" \
+                --add_symbol "${blank}:0" \
+                --add_symbol "${oov}:1" \
+                --add_symbol "${sos_eos}:-1"
+        fi
+
+    fi
+else
+    log "Skip the stages for data preparation"
+fi
+
+
+# ========================== Data preparation is done here. ==========================
+
+
+if ! "${skip_train}"; then
+    if "${use_lm}"; then
+        # Stage 6: collect LM statistics. The train and dev texts are split into
+        # ${_nj} parts, espnet2.bin.lm_train runs in --collect_stats mode on each
+        # part, and the per-job results are aggregated into ${lm_stats_dir}.
+        if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
+            log "Stage 6: LM collect stats: train_set=${data_feats}/lm_train.txt, dev_set=${lm_dev_text}"
+
+            _opts=
+            if [ -n "${lm_config}" ]; then
+                # To generate the config file: e.g.
+                #   % python3 -m espnet2.bin.lm_train --print_config --optim adam
+                _opts+="--config ${lm_config} "
+            fi
+
+            # 1. Split the key file
+            _logdir="${lm_stats_dir}/logdir"
+            mkdir -p "${_logdir}"
+            # Get the minimum number among ${nj} and the number lines of input files
+            _nj=$(min "${nj}" "$(<${data_feats}/lm_train.txt wc -l)" "$(<${lm_dev_text} wc -l)")
+
+            # One split file per job for the training text ...
+            key_file="${data_feats}/lm_train.txt"
+            split_scps=""
+            for n in $(seq ${_nj}); do
+                split_scps+=" ${_logdir}/train.${n}.scp"
+            done
+            # shellcheck disable=SC2086
+            utils/split_scp.pl "${key_file}" ${split_scps}
+
+            # ... and likewise for the dev text
+            key_file="${lm_dev_text}"
+            split_scps=""
+            for n in $(seq ${_nj}); do
+                split_scps+=" ${_logdir}/dev.${n}.scp"
+            done
+            # shellcheck disable=SC2086
+            utils/split_scp.pl "${key_file}" ${split_scps}
+
+            # 2. Generate run.sh
+            log "Generate '${lm_stats_dir}/run.sh'. You can resume the process from stage 6 using this script"
+            mkdir -p "${lm_stats_dir}"; echo "${run_args} --stage 6 \"\$@\"; exit \$?" > "${lm_stats_dir}/run.sh"; chmod +x "${lm_stats_dir}/run.sh"
+
+            # 3. Submit jobs
+            log "LM collect-stats started... log: '${_logdir}/stats.*.log'"
+            # NOTE: --*_shape_file doesn't require length information if --batch_type=unsorted,
+            #       but it's used only for deciding the sample ids.
+            # On failure the log of the first job is printed before exiting.
+            # shellcheck disable=SC2086
+            ${train_cmd} JOB=1:"${_nj}" "${_logdir}"/stats.JOB.log \
+                ${python} -m espnet2.bin.lm_train \
+                    --collect_stats true \
+                    --use_preprocessor true \
+                    --bpemodel "${bpemodel}" \
+                    --token_type "${lm_token_type}"\
+                    --token_list "${lm_token_list}" \
+                    --non_linguistic_symbols "${nlsyms_txt}" \
+                    --cleaner "${cleaner}" \
+                    --g2p "${g2p}" \
+                    --train_data_path_and_name_and_type "${data_feats}/lm_train.txt,text,text" \
+                    --valid_data_path_and_name_and_type "${lm_dev_text},text,text" \
+                    --train_shape_file "${_logdir}/train.JOB.scp" \
+                    --valid_shape_file "${_logdir}/dev.JOB.scp" \
+                    --output_dir "${_logdir}/stats.JOB" \
+                    ${_opts} ${lm_args} || { cat "${_logdir}"/stats.1.log; exit 1; }
+
+            # 4. Aggregate shape files
+            _opts=
+            for i in $(seq "${_nj}"); do
+                _opts+="--input_dir ${_logdir}/stats.${i} "
+            done
+            # shellcheck disable=SC2086
+            ${python} -m espnet2.bin.aggregate_stats_dirs ${_opts} --output_dir "${lm_stats_dir}"
+
+            # Append the num-tokens at the last dimensions. This is used for batch-bins count
+            # (N is the number of lines in the LM token list)
+            <"${lm_stats_dir}/train/text_shape" \
+                awk -v N="$(<${lm_token_list} wc -l)" '{ print $0 "," N }' \
+                >"${lm_stats_dir}/train/text_shape.${lm_token_type}"
+
+            <"${lm_stats_dir}/valid/text_shape" \
+                awk -v N="$(<${lm_token_list} wc -l)" '{ print $0 "," N }' \
+                >"${lm_stats_dir}/valid/text_shape.${lm_token_type}"
+        fi
+
+
+        # Stage 7: train the LM with espnet2.bin.lm_train, submitted through
+        # espnet2.bin.launch. The training data options depend on whether the
+        # corpus is split into ${num_splits_lm} subsets.
+        if [ ${stage} -le 7 ] && [ ${stop_stage} -ge 7 ]; then
+            log "Stage 7: LM Training: train_set=${data_feats}/lm_train.txt, dev_set=${lm_dev_text}"
+
+            _opts=
+            if [ -n "${lm_config}" ]; then
+                # To generate the config file: e.g.
+                #   % python3 -m espnet2.bin.lm_train --print_config --optim adam
+                _opts+="--config ${lm_config} "
+            fi
+
+            if [ "${num_splits_lm}" -gt 1 ]; then
+                # If you met a memory error when parsing text files, this option may help you.
+                # The corpus is split into subsets and each subset is used for training one by one in order,
+                # so the memory footprint can be limited to the memory required for each dataset.
+
+                # The ".done" marker makes the split step run only once
+                _split_dir="${lm_stats_dir}/splits${num_splits_lm}"
+                if [ ! -f "${_split_dir}/.done" ]; then
+                    rm -f "${_split_dir}/.done"
+                    ${python} -m espnet2.bin.split_scps \
+                      --scps "${data_feats}/lm_train.txt" "${lm_stats_dir}/train/text_shape.${lm_token_type}" \
+                      --num_splits "${num_splits_lm}" \
+                      --output_dir "${_split_dir}"
+                    touch "${_split_dir}/.done"
+                else
+                    log "${_split_dir}/.done exists. Spliting is skipped"
+                fi
+
+                _opts+="--train_data_path_and_name_and_type ${_split_dir}/lm_train.txt,text,text "
+                _opts+="--train_shape_file ${_split_dir}/text_shape.${lm_token_type} "
+                _opts+="--multiple_iterator true "
+
+            else
+                _opts+="--train_data_path_and_name_and_type ${data_feats}/lm_train.txt,text,text "
+                _opts+="--train_shape_file ${lm_stats_dir}/train/text_shape.${lm_token_type} "
+            fi
+
+            # NOTE(kamo): --fold_length is used only if --batch_type=folded and it's ignored in the other case
+
+            log "Generate '${lm_exp}/run.sh'. You can resume the process from stage 7 using this script"
+            mkdir -p "${lm_exp}"; echo "${run_args} --stage 7 \"\$@\"; exit \$?" > "${lm_exp}/run.sh"; chmod +x "${lm_exp}/run.sh"
+
+            log "LM training started... log: '${lm_exp}/train.log'"
+            # Pick the job name passed to ${cuda_cmd} via --name
+            if echo "${cuda_cmd}" | grep -e queue.pl -e queue-freegpu.pl &> /dev/null; then
+                # SGE can't include "/" in a job name
+                jobname="$(basename ${lm_exp})"
+            else
+                jobname="${lm_exp}/train.log"
+            fi
+
+            # Everything after "--" is the training command run by the launcher
+            # shellcheck disable=SC2086
+            ${python} -m espnet2.bin.launch \
+                --cmd "${cuda_cmd} --name ${jobname}" \
+                --log "${lm_exp}"/train.log \
+                --ngpu "${ngpu}" \
+                --num_nodes "${num_nodes}" \
+                --init_file_prefix "${lm_exp}"/.dist_init_ \
+                --multiprocessing_distributed true -- \
+                ${python} -m espnet2.bin.lm_train \
+                    --ngpu "${ngpu}" \
+                    --use_preprocessor true \
+                    --bpemodel "${bpemodel}" \
+                    --token_type "${lm_token_type}"\
+                    --token_list "${lm_token_list}" \
+                    --non_linguistic_symbols "${nlsyms_txt}" \
+                    --cleaner "${cleaner}" \
+                    --g2p "${g2p}" \
+                    --valid_data_path_and_name_and_type "${lm_dev_text},text,text" \
+                    --valid_shape_file "${lm_stats_dir}/valid/text_shape.${lm_token_type}" \
+                    --fold_length "${lm_fold_length}" \
+                    --resume true \
+                    --output_dir "${lm_exp}" \
+                    ${_opts} ${lm_args}
+
+        fi
+
+
+        # Stage 8: compute the perplexity of the trained LM on ${lm_test_text}
+        # with espnet2.bin.lm_calc_perplexity and log the value read from
+        # ${lm_exp}/perplexity_test/ppl.
+        if [ ${stage} -le 8 ] && [ ${stop_stage} -ge 8 ]; then
+            log "Stage 8: Calc perplexity: ${lm_test_text}"
+            # No additional options are passed at the moment
+            _opts=
+            # TODO(kamo): Parallelize?
+            log "Perplexity calculation started... log: '${lm_exp}/perplexity_test/lm_calc_perplexity.log'"
+            # shellcheck disable=SC2086
+            ${cuda_cmd} --gpu "${ngpu}" "${lm_exp}"/perplexity_test/lm_calc_perplexity.log \
+                ${python} -m espnet2.bin.lm_calc_perplexity \
+                    --ngpu "${ngpu}" \
+                    --data_path_and_name_and_type "${lm_test_text},text,text" \
+                    --train_config "${lm_exp}"/config.yaml \
+                    --model_file "${lm_exp}/${inference_lm}" \
+                    --output_dir "${lm_exp}/perplexity_test" \
+                    ${_opts}
+            log "PPL: ${lm_test_text}: $(cat ${lm_exp}/perplexity_test/ppl)"
+
+        fi
+
+    else
+        log "Stage 6-8: Skip lm-related stages: use_lm=${use_lm}"
+    fi
+
+
+    # The n-gram output directory is created whenever n-gram usage is enabled,
+    # regardless of which stages are executed.
+    if "${use_ngram}"; then
+        mkdir -p "${ngram_exp}"
+    fi
+    # Stage 9: train an n-gram LM of order ${ngram_num} with lmplz and convert the
+    # ARPA file into a binary model with build_binary.
+    if [ ${stage} -le 9 ] && [ ${stop_stage} -ge 9 ]; then
+        if "${use_ngram}"; then
+            log "Stage 9: Ngram Training: train_set=${data_feats}/lm_train.txt"
+            # The first field of every lm_train.txt line is skipped by the other
+            # consumers of this file (they use "--field 2-"), so the transcript is
+            # everything from field 2 on. Use "-f 2-": "-f 2" would train the
+            # n-gram on the first word of each utterance only.
+            cut -f 2- -d " " "${data_feats}/lm_train.txt" | lmplz -S "20%" --discount_fallback -o "${ngram_num}" - >"${ngram_exp}/${ngram_num}gram.arpa"
+            build_binary -s "${ngram_exp}/${ngram_num}gram.arpa" "${ngram_exp}/${ngram_num}gram.bin"
+        else
+            log "Stage 9: Skip ngram stages: use_ngram=${use_ngram}"
+        fi
+    fi
+
+
+    # Stage 10: collect statistics for the joint enh+ASR model. The train and
+    # valid key files are split into ${_nj} parts, espnet2.bin.enh_s2t_train runs
+    # in --collect_stats mode on each part, and the per-job results are
+    # aggregated into ${enh_asr_stats_dir}.
+    if [ ${stage} -le 10 ] && [ ${stop_stage} -ge 10 ]; then
+        _enh_asr_train_dir="${data_feats}/${train_set}"
+        _enh_asr_valid_dir="${data_feats}/${valid_set}"
+        log "Stage 10: ASR collect stats: train_set=${_enh_asr_train_dir}, valid_set=${_enh_asr_valid_dir}"
+
+        _opts=
+        if [ -n "${enh_asr_config}" ]; then
+            # To generate the config file: e.g.
+            #   % python3 -m espnet2.bin.enh_s2t_train --print_config --optim adam
+            _opts+="--config ${enh_asr_config} "
+        fi
+
+        # Select the input list and its loader type from the feature type
+        _feats_type="$(<${_enh_asr_train_dir}/feats_type)"
+        if [ "${_feats_type}" = raw ]; then
+            _scp=wav.scp
+            if [[ "${audio_format}" == *ark* ]]; then
+                _type=kaldi_ark
+            else
+                # "sound" supports "wav", "flac", etc.
+                _type=sound
+            fi
+            _opts+="--frontend_conf fs=${fs} "
+        else
+            _scp=feats.scp
+            _type=kaldi_ark
+            _input_size="$(<${_enh_asr_train_dir}/feats_dim)"
+            _opts+="--input_size=${_input_size} "
+        fi
+
+        # 1. Split the key file
+        _logdir="${enh_asr_stats_dir}/logdir"
+        mkdir -p "${_logdir}"
+
+        # Get the minimum number among ${nj} and the number lines of input files
+        _nj=$(min "${nj}" "$(<${_enh_asr_train_dir}/${_scp} wc -l)" "$(<${_enh_asr_valid_dir}/${_scp} wc -l)")
+
+        key_file="${_enh_asr_train_dir}/${_scp}"
+        split_scps=""
+        for n in $(seq "${_nj}"); do
+            split_scps+=" ${_logdir}/train.${n}.scp"
+        done
+        # shellcheck disable=SC2086
+        utils/split_scp.pl "${key_file}" ${split_scps}
+
+        key_file="${_enh_asr_valid_dir}/${_scp}"
+        split_scps=""
+        for n in $(seq "${_nj}"); do
+            split_scps+=" ${_logdir}/valid.${n}.scp"
+        done
+        # shellcheck disable=SC2086
+        utils/split_scp.pl "${key_file}" ${split_scps}
+
+        # 2. Generate run.sh
+        log "Generate '${enh_asr_stats_dir}/run.sh'. You can resume the process from stage 10 using this script"
+        mkdir -p "${enh_asr_stats_dir}"; echo "${run_args} --stage 10 \"\$@\"; exit \$?" > "${enh_asr_stats_dir}/run.sh"; chmod +x "${enh_asr_stats_dir}/run.sh"
+
+        # 3. Submit jobs
+        log "ASR collect-stats started... log: '${_logdir}/stats.*.log'"
+
+        # NOTE: --*_shape_file doesn't require length information if --batch_type=unsorted,
+        #       but it's used only for deciding the sample ids.
+
+        # speech_ref1 is read from spk1.scp, the same reference list that the
+        # training stage pairs with the speech_ref1_shape file produced here;
+        # reading the mixture list instead would record the shapes of the wrong
+        # signal. On failure the log of the first job is printed before exiting.
+        # shellcheck disable=SC2086
+        ${train_cmd} JOB=1:"${_nj}" "${_logdir}"/stats.JOB.log \
+            ${python} -m espnet2.bin.enh_s2t_train \
+                --collect_stats true \
+                --use_preprocessor true \
+                --bpemodel "${bpemodel}" \
+                --token_type "${token_type}" \
+                --token_list "${token_list}" \
+                --non_linguistic_symbols "${nlsyms_txt}" \
+                --cleaner "${cleaner}" \
+                --g2p "${g2p}" \
+                --train_data_path_and_name_and_type "${_enh_asr_train_dir}/${_scp},speech,${_type}" \
+                --train_data_path_and_name_and_type "${_enh_asr_train_dir}/spk1.scp,speech_ref1,${_type}" \
+                --train_data_path_and_name_and_type "${_enh_asr_train_dir}/text,text,text" \
+                --valid_data_path_and_name_and_type "${_enh_asr_valid_dir}/${_scp},speech,${_type}" \
+                --valid_data_path_and_name_and_type "${_enh_asr_valid_dir}/spk1.scp,speech_ref1,${_type}" \
+                --valid_data_path_and_name_and_type "${_enh_asr_valid_dir}/text,text,text" \
+                --train_shape_file "${_logdir}/train.JOB.scp" \
+                --valid_shape_file "${_logdir}/valid.JOB.scp" \
+                --output_dir "${_logdir}/stats.JOB" \
+                ${_opts} ${enh_asr_args} || { cat "${_logdir}"/stats.1.log; exit 1; }
+
+        # 4. Aggregate shape files
+        _opts=
+        for i in $(seq "${_nj}"); do
+            _opts+="--input_dir ${_logdir}/stats.${i} "
+        done
+        # shellcheck disable=SC2086
+        ${python} -m espnet2.bin.aggregate_stats_dirs ${_opts} --output_dir "${enh_asr_stats_dir}"
+
+        # Append the num-tokens at the last dimensions. This is used for batch-bins count
+        # (N is the number of lines in the token list)
+        <"${enh_asr_stats_dir}/train/text_shape" \
+            awk -v N="$(<${token_list} wc -l)" '{ print $0 "," N }' \
+            >"${enh_asr_stats_dir}/train/text_shape.${token_type}"
+
+        <"${enh_asr_stats_dir}/valid/text_shape" \
+            awk -v N="$(<${token_list} wc -l)" '{ print $0 "," N }' \
+            >"${enh_asr_stats_dir}/valid/text_shape.${token_type}"
+    fi
+
+
+    if [ ${stage} -le 11 ] && [ ${stop_stage} -ge 11 ]; then
+        _enh_asr_train_dir="${data_feats}/${train_set}"
+        _enh_asr_valid_dir="${data_feats}/${valid_set}"
+        log "Stage 11: ASR Training: train_set=${_enh_asr_train_dir}, valid_set=${_enh_asr_valid_dir}"
+
+        _opts=
+        if [ -n "${enh_asr_config}" ]; then
+            # To generate the config file: e.g.
+            #   % python3 -m espnet2.bin.enh_s2t_train --print_config --optim adam
+            _opts+="--config ${enh_asr_config} "
+        fi
+
+        _feats_type="$(<${_enh_asr_train_dir}/feats_type)"
+        if [ "${_feats_type}" = raw ]; then
+            _scp=wav.scp
+            # "sound" supports "wav", "flac", etc.
+            if [[ "${audio_format}" == *ark* ]]; then
+                _type=kaldi_ark
+            else
+                _type=sound
+            fi
+            _fold_length="$((enh_asr_speech_fold_length * 100))"
+            _opts+="--frontend_conf fs=${fs} "
+        else
+            _scp=feats.scp
+            _type=kaldi_ark
+            _fold_length="${enh_asr_speech_fold_length}"
+            _input_size="$(<${_enh_asr_train_dir}/feats_dim)"
+            _opts+="--input_size=${_input_size} "
+
+        fi
+        if [ "${feats_normalize}" = global_mvn ]; then
+            # Default normalization is utterance_mvn and changes to global_mvn
+            _opts+="--normalize=global_mvn --normalize_conf stats_file=${enh_asr_stats_dir}/train/feats_stats.npz "
+        fi
+
+        if [ "${num_splits_asr}" -gt 1 ]; then
+            # If you met a memory error when parsing text files, this option may help you.
+            # The corpus is split into subsets and each subset is used for training one by one in order,
+            # so the memory footprint can be limited to the memory required for each dataset.
+
+            _split_dir="${enh_asr_stats_dir}/splits${num_splits_asr}"
+            if [ ! -f "${_split_dir}/.done" ]; then
+                rm -f "${_split_dir}/.done"
+                ${python} -m espnet2.bin.split_scps \
+                  --scps \
+                      "${_enh_asr_train_dir}/${_scp}" \
+                      "${_enh_asr_train_dir}/text" \
+                      "${enh_asr_stats_dir}/train/speech_shape" \
+                      "${enh_asr_stats_dir}/train/speech_ref1_shape" \
+                      "${enh_asr_stats_dir}/train/text_shape.${token_type}" \
+                  --num_splits "${num_splits_asr}" \
+                  --output_dir "${_split_dir}"
+                touch "${_split_dir}/.done"
+            else
+                log "${_split_dir}/.done exists. Spliting is skipped"
+            fi
+
+            _opts+="--train_data_path_and_name_and_type ${_split_dir}/${_scp},speech,${_type} "
+            _opts+="--train_data_path_and_name_and_type ${_split_dir}/spk1.scp,speech_ref1,${_type} "
+            _opts+="--train_data_path_and_name_and_type ${_split_dir}/text,text,text "
+            _opts+="--train_shape_file ${_split_dir}/speech_shape "
+            _opts+="--train_shape_file ${_split_dir}/speech_ref1_shape "
+            _opts+="--train_shape_file ${_split_dir}/text_shape.${token_type} "
+            _opts+="--multiple_iterator true "
+
+        else
+            _opts+="--train_data_path_and_name_and_type ${_enh_asr_train_dir}/${_scp},speech,${_type} "
+            _opts+="--train_data_path_and_name_and_type ${_enh_asr_train_dir}/spk1.scp,speech_ref1,${_type} "
+            _opts+="--train_data_path_and_name_and_type ${_enh_asr_train_dir}/text,text,text "
+            _opts+="--train_shape_file ${enh_asr_stats_dir}/train/speech_shape "
+            _opts+="--train_shape_file ${enh_asr_stats_dir}/train/speech_ref1_shape "
+            _opts+="--train_shape_file ${enh_asr_stats_dir}/train/text_shape.${token_type} "
+        fi
+
+        log "Generate '${enh_asr_exp}/run.sh'. You can resume the process from stage 11 using this script"
+        mkdir -p "${enh_asr_exp}"; echo "${run_args} --stage 11 \"\$@\"; exit \$?" > "${enh_asr_exp}/run.sh"; chmod +x "${enh_asr_exp}/run.sh"
+
+        # NOTE(kamo): --fold_length is used only if --batch_type=folded and it's ignored in the other case
+        log "ASR training started... log: '${enh_asr_exp}/train.log'"
+        if echo "${cuda_cmd}" | grep -e queue.pl -e queue-freegpu.pl &> /dev/null; then
+            # SGE can't include "/" in a job name
+            jobname="$(basename ${enh_asr_exp})"
+        else
+            jobname="${enh_asr_exp}/train.log"
+        fi
+
+        # shellcheck disable=SC2086
+        ${python} -m espnet2.bin.launch \
+            --cmd "${cuda_cmd} --name ${jobname}" \
+            --log "${enh_asr_exp}"/train.log \
+            --ngpu "${ngpu}" \
+            --num_nodes "${num_nodes}" \
+            --init_file_prefix "${enh_asr_exp}"/.dist_init_ \
+            --multiprocessing_distributed true -- \
+            ${python} -m espnet2.bin.enh_s2t_train \
+                --use_preprocessor true \
+                --bpemodel "${bpemodel}" \
+                --token_type "${token_type}" \
+                --token_list "${token_list}" \
+                --non_linguistic_symbols "${nlsyms_txt}" \
+                --cleaner "${cleaner}" \
+                --g2p "${g2p}" \
+                --valid_data_path_and_name_and_type "${_enh_asr_valid_dir}/${_scp},speech,${_type}" \
+                --valid_data_path_and_name_and_type "${_enh_asr_valid_dir}/spk1.scp,speech_ref1,${_type}" \
+                --valid_data_path_and_name_and_type "${_enh_asr_valid_dir}/text,text,text" \
+                --valid_shape_file "${enh_asr_stats_dir}/valid/speech_shape" \
+                --valid_shape_file "${enh_asr_stats_dir}/valid/speech_ref1_shape" \
+                --valid_shape_file "${enh_asr_stats_dir}/valid/text_shape.${token_type}" \
+                --resume true \
+                --init_param ${pretrained_model} \
+                --ignore_init_mismatch ${ignore_init_mismatch} \
+                --fold_length "${_fold_length}" \
+                --fold_length "${_fold_length}" \
+                --fold_length "${enh_asr_text_fold_length}" \
+                --output_dir "${enh_asr_exp}" \
+                ${_opts} ${enh_asr_args}
+
+    fi
+else
+    log "Skip the training stages"
+fi
+
+
+if [ -n "${download_model}" ]; then
+    log "Use ${download_model} for decoding and evaluation"
+    enh_asr_exp="${expdir}/${download_model}"
+    mkdir -p "${enh_asr_exp}"
+
+    # If the model already exists, you can skip downloading
+    espnet_model_zoo_download --unpack true "${download_model}" > "${enh_asr_exp}/config.txt"
+
+    # Get the path of each file
+    _enh_asr_model_file=$(<"${enh_asr_exp}/config.txt" sed -e "s/.*'enh_s2t_model_file': '\([^']*\)'.*$/\1/")
+    _enh_asr_train_config=$(<"${enh_asr_exp}/config.txt" sed -e "s/.*'enh_s2t_train_config': '\([^']*\)'.*$/\1/")
+
+    # Create symbolic links
+    ln -sf "${_enh_asr_model_file}" "${enh_asr_exp}"
+    ln -sf "${_enh_asr_train_config}" "${enh_asr_exp}"
+    inference_enh_asr_model=$(basename "${_enh_asr_model_file}")
+
+    if [ "$(<${enh_asr_exp}/config.txt grep -c lm_file)" -gt 0 ]; then
+        _lm_file=$(<"${enh_asr_exp}/config.txt" sed -e "s/.*'lm_file': '\([^']*\)'.*$/\1/")
+        _lm_train_config=$(<"${enh_asr_exp}/config.txt" sed -e "s/.*'lm_train_config': '\([^']*\)'.*$/\1/")
+
+        lm_exp="${expdir}/${download_model}/lm"
+        mkdir -p "${lm_exp}"
+
+        ln -sf "${_lm_file}" "${lm_exp}"
+        ln -sf "${_lm_train_config}" "${lm_exp}"
+        inference_lm=$(basename "${_lm_file}")
+    fi
+
+fi
+
+
+if ! "${skip_eval}"; then
+    # Stage 12: ASR decoding on every test set.
+    # For each set the key file is split into per-job lists, parallel
+    # inference jobs are launched through ${_cmd}, and the per-job 1-best
+    # outputs are merged (sorted by key) into ${_dir}.
+    if [ ${stage} -le 12 ] && [ ${stop_stage} -ge 12 ]; then
+        log "Stage 12: Decoding: training_dir=${enh_asr_exp}"
+
+        if ${gpu_inference}; then
+            _cmd="${cuda_cmd}"
+            _ngpu=1
+        else
+            _cmd="${decode_cmd}"
+            _ngpu=0
+        fi
+
+        # Optional decoding arguments. Every fragment ends with a space so that
+        # fragments can be appended in any order without gluing two options together.
+        _opts=
+        if [ -n "${inference_config}" ]; then
+            _opts+="--config ${inference_config} "
+        fi
+        if "${use_lm}"; then
+            if "${use_word_lm}"; then
+                _opts+="--word_lm_train_config ${lm_exp}/config.yaml "
+                _opts+="--word_lm_file ${lm_exp}/${inference_lm} "
+            else
+                _opts+="--lm_train_config ${lm_exp}/config.yaml "
+                _opts+="--lm_file ${lm_exp}/${inference_lm} "
+            fi
+        fi
+        if "${use_ngram}"; then
+            _opts+="--ngram_file ${ngram_exp}/${inference_ngram} "
+        fi
+
+        # Generate run.sh
+        log "Generate '${enh_asr_exp}/${inference_tag}/run.sh'. You can resume the process from stage 12 using this script"
+        mkdir -p "${enh_asr_exp}/${inference_tag}"; echo "${run_args} --stage 12 \"\$@\"; exit \$?" > "${enh_asr_exp}/${inference_tag}/run.sh"; chmod +x "${enh_asr_exp}/${inference_tag}/run.sh"
+
+        for dset in ${test_sets}; do
+            _data="${data_feats}/${dset}"
+            _dir="${enh_asr_exp}/${inference_tag}/${dset}"
+            _logdir="${_dir}/logdir"
+            mkdir -p "${_logdir}"
+
+            # Choose the input list and loader type from the dumped feature type
+            _feats_type="$(<"${_data}/feats_type")"
+            if [ "${_feats_type}" = raw ]; then
+                _scp=wav.scp
+                if [[ "${audio_format}" == *ark* ]]; then
+                    _type=kaldi_ark
+                else
+                    _type=sound
+                fi
+            else
+                _scp=feats.scp
+                _type=kaldi_ark
+            fi
+
+            # 1. Split the key file
+            key_file=${_data}/${_scp}
+            split_scps=""
+            # Never start more jobs than there are utterances
+            _nj=$(min "${inference_nj}" "$(<"${key_file}" wc -l)")
+            asr_inference_tool="espnet2.bin.asr_inference"
+
+            for n in $(seq "${_nj}"); do
+                split_scps+=" ${_logdir}/keys.${n}.scp"
+            done
+            # shellcheck disable=SC2086
+            utils/split_scp.pl "${key_file}" ${split_scps}
+
+            # 2. Submit decoding jobs
+            log "Decoding started... log: '${_logdir}/asr_inference.*.log'"
+            # shellcheck disable=SC2086
+            ${_cmd} --gpu "${_ngpu}" JOB=1:"${_nj}" "${_logdir}"/asr_inference.JOB.log \
+                ${python} -m ${asr_inference_tool} \
+                    --batch_size ${batch_size} \
+                    --ngpu "${_ngpu}" \
+                    --data_path_and_name_and_type "${_data}/${_scp},speech,${_type}" \
+                    --key_file "${_logdir}"/keys.JOB.scp \
+                    --asr_train_config "${enh_asr_exp}"/config.yaml \
+                    --asr_model_file "${enh_asr_exp}"/"${inference_enh_asr_model}" \
+                    --output_dir "${_logdir}"/output.JOB \
+                    ${_opts} ${asr_inference_args}
+
+            # 3. Concatenates the output files from each jobs
+            for f in token token_int score text; do
+                for i in $(seq "${_nj}"); do
+                    cat "${_logdir}/output.${i}/1best_recog/${f}"
+                done | LC_ALL=C sort -k1 >"${_dir}/${f}"
+            done
+
+        done
+    fi
+
+
+    # Stage 13: produce enhanced speech for every test set with the trained
+    # joint model, then merge the per-job spk*.scp lists into ${_dir}.
+    if [ ${stage} -le 13 ] && [ ${stop_stage} -ge 13 ]; then
+        log "Stage 13: Enhance Speech: training_dir=${enh_asr_exp}"
+
+        # Job launcher: CPU unless GPU inference was requested
+        if ! ${gpu_inference}; then
+            _cmd="${decode_cmd}"
+            _ngpu=0
+        else
+            _cmd="${cuda_cmd}"
+            _ngpu=1
+        fi
+
+        _opts=""
+
+        # Generate run.sh
+        log "Generate '${enh_asr_exp}/${inference_tag}/run.sh'. You can resume the process from stage 13 using this script"
+        mkdir -p "${enh_asr_exp}/${inference_tag}"
+        echo "${run_args} --stage 13 \"\$@\"; exit \$?" > "${enh_asr_exp}/${inference_tag}/run.sh"
+        chmod +x "${enh_asr_exp}/${inference_tag}/run.sh"
+
+        for dset in ${test_sets}; do
+            _data="${data_feats}/${dset}"
+            _dir="${enh_asr_exp}/${inference_tag}/${dset}"
+            _logdir="${_dir}/logdir"
+            mkdir -p "${_logdir}"
+
+            # Enhancement always reads waveforms; only the loader type varies
+            _scp=wav.scp
+            case "${audio_format}" in
+                *ark*) _type=kaldi_ark ;;
+                *) _type=sound ;;
+            esac
+
+            # Split the key file into one list per job
+            key_file=${_data}/${_scp}
+            _nj=$(min "${inference_nj}" "$(<${key_file} wc -l)")
+            split_scps=""
+            for n in $(seq "${_nj}"); do
+                split_scps="${split_scps} ${_logdir}/keys.${n}.scp"
+            done
+            # shellcheck disable=SC2086
+            utils/split_scp.pl "${key_file}" ${split_scps}
+
+            # Submit the inference jobs
+            log "Enhancement started... log: '${_logdir}/enh_inference.*.log'"
+            # shellcheck disable=SC2086
+            ${_cmd} --gpu "${_ngpu}" JOB=1:"${_nj}" "${_logdir}"/enh_inference.JOB.log \
+                ${python} -m espnet2.bin.enh_inference \
+                    --enh_s2t_task true \
+                    --ngpu "${_ngpu}" \
+                    --fs "${fs}" \
+                    --data_path_and_name_and_type "${_data}/${_scp},speech_mix,${_type}" \
+                    --key_file "${_logdir}"/keys.JOB.scp \
+                    --train_config "${enh_asr_exp}"/config.yaml \
+                    --model_file "${enh_asr_exp}"/"${inference_enh_asr_model}" \
+                    --output_dir "${_logdir}"/output.JOB \
+                    ${_opts} ${enh_inference_args}
+
+            # Names of the per-speaker output lists: spk1 spk2 ...
+            _spk_list=" "
+            for i in $(seq ${spk_num}); do
+                _spk_list="${_spk_list}spk${i} "
+            done
+
+            # Gather every job's list for a speaker and write it sorted by key
+            for spk in ${_spk_list}; do
+                for i in $(seq "${_nj}"); do
+                    cat "${_logdir}/output.${i}/${spk}.scp"
+                done | LC_ALL=C sort -k1 > "${_dir}/${spk}.scp"
+            done
+        done
+    fi
+
+    # Stage 14: score the ASR hypotheses of every test set with sclite.
+    # For each of cer/wer/ter, the reference (${_data}/text) and the hypothesis
+    # (${_dir}/text) are tokenized at the matching level and written as
+    # ref.trn/hyp.trn; sclite's report goes to score_<type>/result.txt.
+    if [ ${stage} -le 14 ] && [ ${stop_stage} -ge 14 ]; then
+        log "Stage 14: Scoring ASR"
+        if [ "${token_type}" = phn ]; then
+            log "Error: Not implemented for token_type=phn"
+            exit 1
+        fi
+
+        for dset in ${test_sets}; do
+            _data="${data_feats}/${dset}"
+            _dir="${enh_asr_exp}/${inference_tag}/${dset}"
+
+            for _type in cer wer ter; do
+                # TER needs the BPE model; skip it when the file is absent
+                [ "${_type}" = ter ] && [ ! -f "${bpemodel}" ] && continue
+
+                _scoredir="${_dir}/score_${_type}"
+                mkdir -p "${_scoredir}"
+
+                # In every branch below, the second pasted column is an id of the
+                # form "(<utt2spk field 2>-<utt2spk field 1>)" built by awk.
+                if [ "${_type}" = wer ]; then
+                    # Tokenize text to word level
+                    paste \
+                        <(<"${_data}/text" \
+                              ${python} -m espnet2.bin.tokenize_text  \
+                                  -f 2- --input - --output - \
+                                  --token_type word \
+                                  --non_linguistic_symbols "${nlsyms_txt}" \
+                                  --remove_non_linguistic_symbols true \
+                                  --cleaner "${cleaner}" \
+                                  ) \
+                        <(<"${_data}/utt2spk" awk '{ print "(" $2 "-" $1 ")" }') \
+                            >"${_scoredir}/ref.trn"
+
+                    # NOTE(kamo): Don't use cleaner for hyp
+                    paste \
+                        <(<"${_dir}/text"  \
+                              ${python} -m espnet2.bin.tokenize_text  \
+                                  -f 2- --input - --output - \
+                                  --token_type word \
+                                  --non_linguistic_symbols "${nlsyms_txt}" \
+                                  --remove_non_linguistic_symbols true \
+                                  ) \
+                        <(<"${_data}/utt2spk" awk '{ print "(" $2 "-" $1 ")" }') \
+                            >"${_scoredir}/hyp.trn"
+
+
+                elif [ "${_type}" = cer ]; then
+                    # Tokenize text to char level
+                    paste \
+                        <(<"${_data}/text" \
+                              ${python} -m espnet2.bin.tokenize_text  \
+                                  -f 2- --input - --output - \
+                                  --token_type char \
+                                  --non_linguistic_symbols "${nlsyms_txt}" \
+                                  --remove_non_linguistic_symbols true \
+                                  --cleaner "${cleaner}" \
+                                  ) \
+                        <(<"${_data}/utt2spk" awk '{ print "(" $2 "-" $1 ")" }') \
+                            >"${_scoredir}/ref.trn"
+
+                    # NOTE(kamo): Don't use cleaner for hyp
+                    paste \
+                        <(<"${_dir}/text"  \
+                              ${python} -m espnet2.bin.tokenize_text  \
+                                  -f 2- --input - --output - \
+                                  --token_type char \
+                                  --non_linguistic_symbols "${nlsyms_txt}" \
+                                  --remove_non_linguistic_symbols true \
+                                  ) \
+                        <(<"${_data}/utt2spk" awk '{ print "(" $2 "-" $1 ")" }') \
+                            >"${_scoredir}/hyp.trn"
+
+                elif [ "${_type}" = ter ]; then
+                    # Tokenize text using BPE
+                    paste \
+                        <(<"${_data}/text" \
+                              ${python} -m espnet2.bin.tokenize_text  \
+                                  -f 2- --input - --output - \
+                                  --token_type bpe \
+                                  --bpemodel "${bpemodel}" \
+                                  --cleaner "${cleaner}" \
+                                ) \
+                        <(<"${_data}/utt2spk" awk '{ print "(" $2 "-" $1 ")" }') \
+                            >"${_scoredir}/ref.trn"
+
+                    # NOTE(kamo): Don't use cleaner for hyp
+                    paste \
+                        <(<"${_dir}/text" \
+                              ${python} -m espnet2.bin.tokenize_text  \
+                                  -f 2- --input - --output - \
+                                  --token_type bpe \
+                                  --bpemodel "${bpemodel}" \
+                                  ) \
+                        <(<"${_data}/utt2spk" awk '{ print "(" $2 "-" $1 ")" }') \
+                            >"${_scoredir}/hyp.trn"
+
+                fi
+
+                # Compare ref.trn against hyp.trn; the full report is kept in result.txt
+                sclite \
+            ${score_opts} \
+                    -r "${_scoredir}/ref.trn" trn \
+                    -h "${_scoredir}/hyp.trn" trn \
+                    -i rm -o all stdout > "${_scoredir}/result.txt"
+
+                log "Write ${_type} result in ${_scoredir}/result.txt"
+                # Echo only the first two lines matching the SPKR header / Avg row
+                grep -e Avg -e SPKR -m 2 "${_scoredir}/result.txt"
+            done
+        done
+
+        # Recipe-specific extra scoring, run only when the recipe provides it
+        [ -f local/score.sh ] && local/score.sh ${local_score_opts} "${enh_asr_exp}"
+
+        # Show results in Markdown syntax
+        scripts/utils/show_asr_result.sh "${enh_asr_exp}" > "${enh_asr_exp}"/RESULTS.md
+        cat "${enh_asr_exp}"/RESULTS.md
+
+    fi
+
+    # Stage 15: score the enhancement quality of every test set.
+    # Two passes are made: one over the unprocessed observation (wav.scp used
+    # as the "inferred" signal) and one over the enhanced outputs of stage 13.
+    # Per-set results go to <dir>/scoring_enh/result_<protocol>.txt and a
+    # summary is written as RESULTS_enh.md two levels above scoring_enh.
+    if [ ${stage} -le 15 ] && [ ${stop_stage} -ge 15 ]; then
+        log "Stage 15: Scoring Enhancement"
+        _cmd=${decode_cmd}
+
+        # score_obs=true: Scoring for observation signal
+        # score_obs=false: Scoring for enhanced signal
+        for score_obs in true false; do
+            # Perform only at the first time for observation.
+            # The marker tested here is the summary this stage itself writes
+            # (RESULTS_enh.md); testing any other name would never skip.
+            if "${score_obs}" && [ -e "${data_feats}/RESULTS_enh.md" ]; then
+                log "${data_feats}/RESULTS_enh.md already exists. The scoring for observation will be skipped"
+                continue
+            fi
+
+            for dset in ${test_sets}; do
+                _data="${data_feats}/${dset}"
+                if "${score_obs}"; then
+                    _dir="${data_feats}/${dset}/scoring_enh"
+                else
+                    _dir="${enh_asr_exp}/${inference_tag}/${dset}/scoring_enh"
+                fi
+
+                _logdir="${_dir}/logdir"
+                mkdir -p "${_logdir}"
+
+                # 1. Split the key file
+                key_file=${_data}/wav.scp
+                split_scps=""
+                # Never start more jobs than there are utterances
+                _nj=$(min "${inference_nj}" "$(<"${key_file}" wc -l)")
+                for n in $(seq "${_nj}"); do
+                    split_scps+=" ${_logdir}/keys.${n}.scp"
+                done
+                # shellcheck disable=SC2086
+                utils/split_scp.pl "${key_file}" ${split_scps}
+
+                # One --ref_scp / --inf_scp pair per speaker
+                _ref_scp=
+                for spk in $(seq "${spk_num}"); do
+                    _ref_scp+="--ref_scp ${_data}/spk${spk}.scp "
+                done
+                _inf_scp=
+                for spk in $(seq "${spk_num}"); do
+                    if "${score_obs}"; then
+                        # To compute the score of observation, input original wav.scp
+                        _inf_scp+="--inf_scp ${data_feats}/${dset}/wav.scp "
+                    else
+                        _inf_scp+="--inf_scp ${enh_asr_exp}/${inference_tag}/${dset}/spk${spk}.scp "
+                    fi
+                done
+
+                # 2. Submit scoring jobs
+                log "Scoring started... log: '${_logdir}/enh_scoring.*.log'"
+                # shellcheck disable=SC2086
+                ${_cmd} JOB=1:"${_nj}" "${_logdir}"/enh_scoring.JOB.log \
+                    ${python} -m espnet2.bin.enh_scoring \
+                        --key_file "${_logdir}"/keys.JOB.scp \
+                        --output_dir "${_logdir}"/output.JOB \
+                        ${_ref_scp} \
+                        ${_inf_scp} \
+                        --ref_channel ${ref_channel}
+
+                # 3. Merge the per-job outputs of each speaker/protocol, sorted by key
+                for spk in $(seq "${spk_num}"); do
+                    for protocol in ${scoring_protocol} wav; do
+                        for i in $(seq "${_nj}"); do
+                            cat "${_logdir}/output.${i}/${protocol}_spk${spk}"
+                        done | LC_ALL=C sort -k1 > "${_dir}/${protocol}_spk${spk}"
+                    done
+                done
+
+
+                # 4. Average each protocol: the speaker files are pasted side by
+                # side and every even field is treated as a score (assumes each
+                # input line is "<key> <score>" - TODO confirm). The mean over
+                # speakers is taken per line, then the mean over all lines.
+                for protocol in ${scoring_protocol}; do
+                    # shellcheck disable=SC2046
+                    paste $(for j in $(seq ${spk_num}); do echo "${_dir}"/"${protocol}"_spk"${j}" ; done)  |
+                    awk 'BEGIN{sum=0}
+                        {n=0;score=0;for (i=2; i<=NF; i+=2){n+=1;score+=$i}; sum+=score/n}
+                        END{printf ("%.2f\n",sum/NR)}' > "${_dir}/result_${protocol,,}.txt"
+                done
+            done
+
+            # ${_dir} still refers to the last test set; two levels up is the
+            # directory that holds all test sets of this pass.
+            ./scripts/utils/show_enh_score.sh "${_dir}/../.." > "${_dir}/../../RESULTS_enh.md"
+        done
+        log "Evaluation result for observation: ${data_feats}/RESULTS_enh.md"
+        log "Evaluation result for enhancement: ${enh_asr_exp}/${inference_tag}/RESULTS_enh.md"
+
+    fi
+else
+    log "Skip the evaluation stages"
+fi
+
+
+packed_model="${enh_asr_exp}/${enh_asr_exp##*/}_${inference_enh_asr_model%.*}.zip"
+if ! "${skip_upload_hf}"; then
+    # Stage 16: bundle the trained model, its config and any optional
+    # artifacts into the archive named by ${packed_model}.
+    if [ ${stage} -le 16 ] && [ ${stop_stage} -ge 16 ]; then
+        log "Stage 16: Pack model: ${packed_model}"
+
+        # Optional extras, added only when the corresponding feature is in use
+        _opts=""
+        if "${use_lm}"; then
+            # Language model, its config and its evaluation by-products
+            _opts="${_opts}--lm_train_config ${lm_exp}/config.yaml "
+            _opts="${_opts}--lm_file ${lm_exp}/${inference_lm} "
+            _opts="${_opts}--option ${lm_exp}/perplexity_test/ppl "
+            _opts="${_opts}--option ${lm_exp}/images "
+        fi
+        case "${feats_normalize}" in
+            global_mvn) _opts="${_opts}--option ${enh_asr_stats_dir}/train/feats_stats.npz " ;;
+        esac
+        case "${token_type}" in
+            bpe) _opts="${_opts}--option ${bpemodel} " ;;
+        esac
+        case "${nlsyms_txt}" in
+            none) ;;
+            *) _opts="${_opts}--option ${nlsyms_txt} " ;;
+        esac
+
+        # shellcheck disable=SC2086
+        ${python} -m espnet2.bin.pack enh_s2t \
+            --enh_s2t_train_config "${enh_asr_exp}"/config.yaml \
+            --enh_s2t_model_file "${enh_asr_exp}"/"${inference_enh_asr_model}" \
+            ${_opts} \
+            --option "${enh_asr_exp}"/RESULTS.md \
+            --option "${enh_asr_exp}"/RESULTS_enh.md \
+            --option "${enh_asr_exp}"/images \
+            --outpath "${packed_model}"
+    fi
+
+
+    # Stage 17: publish the packed model to a HuggingFace repository.
+    # Requires hf_repo to be set and git-lfs to be installed. The repository is
+    # cloned under ${expdir} (once), the archive is unpacked into it, a README
+    # is rendered from the template, and any change is committed and pushed.
+    if [ ${stage} -le 17 ] && [ ${stop_stage} -ge 17 ]; then
+        [ -z "${hf_repo}" ] && \
+            log "ERROR: You need to setup the variable hf_repo with the name of the repository located at HuggingFace" && \
+            exit 1
+        log "Stage 17: Upload model to HuggingFace: ${hf_repo}"
+
+        # "|| true" keeps a missing git-lfs from aborting before the message below
+        gitlfs=$(git lfs --version 2> /dev/null || true)
+        [ -z "${gitlfs}" ] && \
+            log "ERROR: You need to install git-lfs first" && \
+            exit 1
+
+        # Local clone directory: "/" in the repo name is replaced by "_"
+        dir_repo=${expdir}/hf_${hf_repo//"/"/"_"}
+        [ ! -d "${dir_repo}" ] && git clone "https://huggingface.co/${hf_repo}" "${dir_repo}"
+
+        if command -v git &> /dev/null; then
+            _creator_name="$(git config user.name)"
+            _checkout="git checkout $(git show -s --format=%H)"
+        else
+            _creator_name="$(whoami)"
+            _checkout=""
+        fi
+        # Second-to-last component of the working directory,
+        # e.g. /some/where/espnet/egs2/foo/asr1 -> foo
+        _task="$(pwd | rev | cut -d/ -f2 | rev)"
+        # Strip a trailing "/<task>" part if there is one (no-op for "foo")
+        _corpus="${_task%/*}"
+        _model_name="${_creator_name}/${_corpus}_$(basename "${packed_model}" .zip)"
+
+        # copy files in ${dir_repo}
+        unzip -o "${packed_model}" -d "${dir_repo}"
+        # Generate description file
+        # The variables below are not used in this script directly; they are
+        # expanded when the README template is evaluated.
+        # shellcheck disable=SC2034
+        hf_task=speech-enhancement-recognition
+        # shellcheck disable=SC2034
+        espnet_task=EnhS2T
+        # This recipe's experiment directory is enh_asr_exp
+        # shellcheck disable=SC2034
+        task_exp=${enh_asr_exp}
+        # NOTE(review): eval expands everything in the template; keep the
+        # template a trusted, repository-owned file.
+        eval "echo \"$(cat scripts/utils/TEMPLATE_HF_Readme.md)\"" > "${dir_repo}"/README.md
+
+        this_folder=${PWD}
+        cd "${dir_repo}"
+        # Commit only when unpacking/rendering actually changed something
+        if [ -n "$(git status --porcelain)" ]; then
+            git add .
+            git commit -m "Update model"
+        fi
+        git push
+        cd "${this_folder}"
+    fi
+else
+    log "Skip the uploading stages"
+fi
+
+
+log "Successfully finished. [elapsed=${SECONDS}s]"
diff --git a/egs2/TEMPLATE/enh_asr1/local/path.sh b/egs2/TEMPLATE/enh_asr1/local/path.sh
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/egs2/TEMPLATE/enh_asr1/path.sh b/egs2/TEMPLATE/enh_asr1/path.sh
new file mode 100755
index 00000000000..d2b90a67653
--- /dev/null
+++ b/egs2/TEMPLATE/enh_asr1/path.sh
@@ -0,0 +1,22 @@
+# Environment setup for the recipe (presumably sourced by the recipe scripts
+# from the recipe directory - note the paths relative to $PWD).
+# Repository root is three levels above the current directory.
+MAIN_ROOT=$PWD/../../..
+KALDI_ROOT=$MAIN_ROOT/tools/kaldi
+
+# Put the recipe's utils/ and Kaldi's OpenFst binaries in front of PATH
+export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PATH
+# Stop with exit status 1 when Kaldi's common path file is missing
+[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1
+. $KALDI_ROOT/tools/config/common_path.sh
+export LC_ALL=C
+
+# extra_path.sh is sourced only if activate_python.sh succeeded
+. "${MAIN_ROOT}"/tools/activate_python.sh && . "${MAIN_ROOT}"/tools/extra_path.sh
+# These entries are prepended last, so they take precedence over the ones above
+export PATH=$MAIN_ROOT/utils:$MAIN_ROOT/espnet/bin:$PATH
+
+export OMP_NUM_THREADS=1
+
+# NOTE(kan-bayashi): Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
+export PYTHONIOENCODING=UTF-8
+
+# You need to change or unset NCCL_SOCKET_IFNAME according to your network environment
+# https://docs.nvidia.com/deeplearning/sdk/nccl-developer-guide/docs/env.html#nccl-socket-ifname
+export NCCL_SOCKET_IFNAME="^lo,docker,virbr,vmnet,vboxnet"
+
+# NOTE(kamo): Source at the last to overwrite the setting
+. local/path.sh
diff --git a/egs2/TEMPLATE/enh_asr1/pyscripts b/egs2/TEMPLATE/enh_asr1/pyscripts
new file mode 120000
index 00000000000..90e7cf60b04
--- /dev/null
+++ b/egs2/TEMPLATE/enh_asr1/pyscripts
@@ -0,0 +1 @@
+../asr1/pyscripts
\ No newline at end of file
diff --git a/egs2/TEMPLATE/enh_asr1/scripts/audio b/egs2/TEMPLATE/enh_asr1/scripts/audio
new file mode 120000
index 00000000000..836e57dcd1d
--- /dev/null
+++ b/egs2/TEMPLATE/enh_asr1/scripts/audio
@@ -0,0 +1 @@
+../../enh1/scripts/audio
\ No newline at end of file
diff --git a/egs2/TEMPLATE/enh_asr1/scripts/feats b/egs2/TEMPLATE/enh_asr1/scripts/feats
new file mode 120000
index 00000000000..8b492e66782
--- /dev/null
+++ b/egs2/TEMPLATE/enh_asr1/scripts/feats
@@ -0,0 +1 @@
+../../asr1/scripts/feats
\ No newline at end of file
diff --git a/egs2/TEMPLATE/enh_asr1/scripts/utils/TEMPLATE_HF_Readme.md b/egs2/TEMPLATE/enh_asr1/scripts/utils/TEMPLATE_HF_Readme.md
new file mode 120000
index 00000000000..137c5c9044a
--- /dev/null
+++ b/egs2/TEMPLATE/enh_asr1/scripts/utils/TEMPLATE_HF_Readme.md
@@ -0,0 +1 @@
+../../../asr1/scripts/utils/TEMPLATE_HF_Readme.md
\ No newline at end of file
diff --git a/egs2/TEMPLATE/enh_asr1/scripts/utils/TEMPLATE_Readme.md b/egs2/TEMPLATE/enh_asr1/scripts/utils/TEMPLATE_Readme.md
new file mode 120000
index 00000000000..3479c7ee724
--- /dev/null
+++ b/egs2/TEMPLATE/enh_asr1/scripts/utils/TEMPLATE_Readme.md
@@ -0,0 +1 @@
+../../../asr1/scripts/utils/TEMPLATE_Readme.md
\ No newline at end of file
diff --git a/egs2/TEMPLATE/enh_asr1/scripts/utils/create_README_file.py b/egs2/TEMPLATE/enh_asr1/scripts/utils/create_README_file.py
new file mode 120000
index 00000000000..0fe3405603d
--- /dev/null
+++ b/egs2/TEMPLATE/enh_asr1/scripts/utils/create_README_file.py
@@ -0,0 +1 @@
+../../../asr1/scripts/utils/create_README_file.py
\ No newline at end of file
diff --git a/egs2/TEMPLATE/enh_asr1/scripts/utils/download_from_google_drive.sh b/egs2/TEMPLATE/enh_asr1/scripts/utils/download_from_google_drive.sh
new file mode 120000
index 00000000000..b3c560c573c
--- /dev/null
+++ b/egs2/TEMPLATE/enh_asr1/scripts/utils/download_from_google_drive.sh
@@ -0,0 +1 @@
+../../../../../utils/download_from_google_drive.sh
\ No newline at end of file
diff --git a/egs2/TEMPLATE/enh_asr1/scripts/utils/evaluate_asr.sh b/egs2/TEMPLATE/enh_asr1/scripts/utils/evaluate_asr.sh
new file mode 120000
index 00000000000..cccf5bf788b
--- /dev/null
+++ b/egs2/TEMPLATE/enh_asr1/scripts/utils/evaluate_asr.sh
@@ -0,0 +1 @@
+../../../asr1/scripts/utils/evaluate_asr.sh
\ No newline at end of file
diff --git a/egs2/TEMPLATE/enh_asr1/scripts/utils/get_model_names.py b/egs2/TEMPLATE/enh_asr1/scripts/utils/get_model_names.py
new file mode 120000
index 00000000000..b163314a6c5
--- /dev/null
+++ b/egs2/TEMPLATE/enh_asr1/scripts/utils/get_model_names.py
@@ -0,0 +1 @@
+../../../asr1/scripts/utils/get_model_names.py
\ No newline at end of file
diff --git a/egs2/TEMPLATE/enh_asr1/scripts/utils/perturb_enh_data_dir_speed.sh b/egs2/TEMPLATE/enh_asr1/scripts/utils/perturb_enh_data_dir_speed.sh
new file mode 120000
index 00000000000..0896188f3a1
--- /dev/null
+++ b/egs2/TEMPLATE/enh_asr1/scripts/utils/perturb_enh_data_dir_speed.sh
@@ -0,0 +1 @@
+../../../enh1/scripts/utils/perturb_enh_data_dir_speed.sh
\ No newline at end of file
diff --git a/egs2/TEMPLATE/enh_asr1/scripts/utils/show_asr_result.sh b/egs2/TEMPLATE/enh_asr1/scripts/utils/show_asr_result.sh
new file mode 120000
index 00000000000..ea34b243f2c
--- /dev/null
+++ b/egs2/TEMPLATE/enh_asr1/scripts/utils/show_asr_result.sh
@@ -0,0 +1 @@
+../../../asr1/scripts/utils/show_asr_result.sh
\ No newline at end of file
diff --git a/egs2/TEMPLATE/enh_asr1/scripts/utils/show_enh_score.sh b/egs2/TEMPLATE/enh_asr1/scripts/utils/show_enh_score.sh
new file mode 120000
index 00000000000..6d6490d3760
--- /dev/null
+++ b/egs2/TEMPLATE/enh_asr1/scripts/utils/show_enh_score.sh
@@ -0,0 +1 @@
+../../../enh1/scripts/utils/show_enh_score.sh
\ No newline at end of file
diff --git a/egs2/TEMPLATE/enh_asr1/scripts/utils/upload_models_to_hub.sh b/egs2/TEMPLATE/enh_asr1/scripts/utils/upload_models_to_hub.sh
new file mode 120000
index 00000000000..aeae4732e4b
--- /dev/null
+++ b/egs2/TEMPLATE/enh_asr1/scripts/utils/upload_models_to_hub.sh
@@ -0,0 +1 @@
+../../../asr1/scripts/utils/upload_models_to_hub.sh
\ No newline at end of file
diff --git a/egs2/TEMPLATE/enh_asr1/setup.sh b/egs2/TEMPLATE/enh_asr1/setup.sh
new file mode 100755
index 00000000000..36799ce4a13
--- /dev/null
+++ b/egs2/TEMPLATE/enh_asr1/setup.sh
@@ -0,0 +1,58 @@
+#!/usr/bin/env bash
+# Set bash to 'debug' mode, it will exit on :
+# -e 'error', -u 'undefined variable', -o ... 'error in pipeline', -x 'print commands',
+set -e
+set -u
+set -o pipefail
+
+log() {
+    local fname=${BASH_SOURCE[1]##*/}
+    echo -e "$(date '+%Y-%m-%dT%H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
+}
+help_message=$(cat << EOF
+Usage: $0 <target-dir>
+EOF
+)
+
+
+if [ $# -ne 1 ]; then
+    log "${help_message}"
+    log "Error: 1 positional argument is required."
+    exit 2
+fi
+
+
+dir=$1
+mkdir -p "${dir}"
+
+if [ ! -d "${dir}"/../../TEMPLATE ]; then
+    log "Error: ${dir}/../../TEMPLATE should exist. You may specify wrong directory."
+    exit 1
+fi
+
+targets=""
+
+# Copy
+for f in cmd.sh conf local; do
+    target="${dir}"/../../TEMPLATE/enh_asr1/"${f}"
+    cp -r "${target}" "${dir}"
+    targets+="${dir}/${target} "
+done
+
+
+# Symlinks to TEMPLATE
+for f in enh_asr.sh path.sh db.sh scripts pyscripts; do
+    target=../../TEMPLATE/enh_asr1/"${f}"
+    ln -sf "${target}" "${dir}"
+    targets+="${dir}/${target} "
+done
+
+
+# Symlinks to Kaldi
+for f in steps utils; do
+    target=../../../tools/kaldi/egs/wsj/s5/"${f}"
+    ln -sf "${target}" "${dir}"
+    targets+="${dir}/${target} "
+done
+
+log "Created: ${targets}"
diff --git a/egs2/TEMPLATE/enh_asr1/steps b/egs2/TEMPLATE/enh_asr1/steps
new file mode 120000
index 00000000000..91f2d234e20
--- /dev/null
+++ b/egs2/TEMPLATE/enh_asr1/steps
@@ -0,0 +1 @@
+../../../tools/kaldi/egs/wsj/s5/steps
\ No newline at end of file
diff --git a/egs2/TEMPLATE/enh_asr1/utils b/egs2/TEMPLATE/enh_asr1/utils
new file mode 120000
index 00000000000..6d93948f170
--- /dev/null
+++ b/egs2/TEMPLATE/enh_asr1/utils
@@ -0,0 +1 @@
+../../../tools/kaldi/egs/wsj/s5/utils/
\ No newline at end of file
diff --git a/egs2/TEMPLATE/enh_st1/cmd.sh b/egs2/TEMPLATE/enh_st1/cmd.sh
new file mode 100644
index 00000000000..2aae6919fef
--- /dev/null
+++ b/egs2/TEMPLATE/enh_st1/cmd.sh
@@ -0,0 +1,110 @@
+# ====== About run.pl, queue.pl, slurm.pl, and ssh.pl ======
+# Usage: <cmd>.pl [options] JOB=1:<nj> <log> <command...>
+# e.g.
+#   run.pl --mem 4G JOB=1:10 echo.JOB.log echo JOB
+#
+# Options:
+#   --time <time>: Limit the maximum time to execute.
+#   --mem <mem>: Limit the maximum memory usage.
+#   --max-jobs-run <njob>: Limit the number of parallel jobs. This is ignored for non-array jobs.
+#   --num-threads <nthreads>: Specify the number of CPU cores.
+#   --gpu <ngpu>: Specify the number of GPU devices.
+#   --config: Change the configuration file from default.
+#
+# "JOB=1:10" is used for "array jobs" and it can control the number of parallel jobs.
+# The left string of "=", i.e. "JOB", is replaced by <N>(Nth job) in the command and the log file name,
+# e.g. "echo JOB" is changed to "echo 3" for the 3rd job and "echo 8" for 8th job respectively.
+# Note that the number must start with a positive number, so you can't use "JOB=0:10" for example.
+#
+# run.pl, queue.pl, slurm.pl, and ssh.pl have unified interface, not depending on its backend.
+# These options are mapping to specific options for each backend and
+# it is configured by "conf/queue.conf" and "conf/slurm.conf" by default.
+# If jobs failed, your configuration might be wrong for your environment.
+#
+#
+# The official documentation for run.pl, queue.pl, slurm.pl, and ssh.pl:
+#   "Parallelization in Kaldi": http://kaldi-asr.org/doc/queue.html
+# =========================================================~
+
+
+# Select the backend used by run.sh from "local", "stdout", "sge", "pbs", "slurm", "ssh", or "jhu"
+cmd_backend='local'
+
+# Local machine, without any Job scheduling system
+if [ "${cmd_backend}" = local ]; then
+
+    # The other usage
+    export train_cmd="run.pl"
+    # Used for "*_train.py": "--gpu" is appended optionally by run.sh
+    export cuda_cmd="run.pl"
+    # Used for "*_recog.py"
+    export decode_cmd="run.pl"
+
+# Local machine logging to stdout and log file, without any Job scheduling system
+elif [ "${cmd_backend}" = stdout ]; then
+
+    # The other usage
+    export train_cmd="stdout.pl"
+    # Used for "*_train.py": "--gpu" is appended optionally by run.sh
+    export cuda_cmd="stdout.pl"
+    # Used for "*_recog.py"
+    export decode_cmd="stdout.pl"
+
+
+# "qsub" (Sun Grid Engine, or derivation of it)
+elif [ "${cmd_backend}" = sge ]; then
+    # The default setting is written in conf/queue.conf.
+    # You must change "-q g.q" for the "queue" for your environment.
+    # To know the "queue" names, type "qhost -q"
+    # Note that to use "--gpu *", you have to setup "complex_value" for the system scheduler.
+
+    export train_cmd="queue.pl"
+    export cuda_cmd="queue.pl"
+    export decode_cmd="queue.pl"
+
+
+# "qsub" (Torque/PBS.)
+elif [ "${cmd_backend}" = pbs ]; then
+    # The default setting is written in conf/pbs.conf.
+
+    export train_cmd="pbs.pl"
+    export cuda_cmd="pbs.pl"
+    export decode_cmd="pbs.pl"
+
+
+# "sbatch" (Slurm)
+elif [ "${cmd_backend}" = slurm ]; then
+    # The default setting is written in conf/slurm.conf.
+    # You must change "-p cpu" and "-p gpu" for the "partition" for your environment.
+    # To know the "partition" names, type "sinfo".
+    # You can use "--gpu * " by default for slurm and it is interpreted as "--gres gpu:*"
+    # The devices are allocated exclusively using "${CUDA_VISIBLE_DEVICES}".
+
+    export train_cmd="slurm.pl"
+    export cuda_cmd="slurm.pl"
+    export decode_cmd="slurm.pl"
+
+elif [ "${cmd_backend}" = ssh ]; then
+    # You have to create ".queue/machines" to specify the host to execute jobs.
+    # e.g. .queue/machines
+    #   host1
+    #   host2
+    #   host3
+    # Assuming you can login them without any password, i.e. You have to set ssh keys.
+
+    export train_cmd="ssh.pl"
+    export cuda_cmd="ssh.pl"
+    export decode_cmd="ssh.pl"
+
+# This is an example of specifying several unique options in the JHU CLSP cluster setup.
+# Users can modify/add their own command options according to their cluster environments.
+elif [ "${cmd_backend}" = jhu ]; then
+
+    export train_cmd="queue.pl --mem 2G"
+    export cuda_cmd="queue-freegpu.pl --mem 2G --gpu 1 --config conf/queue.conf"
+    export decode_cmd="queue.pl --mem 4G"
+
+else
+    echo "$0: Error: Unknown cmd_backend=${cmd_backend}" 1>&2
+    return 1
+fi
diff --git a/egs2/TEMPLATE/enh_st1/conf/fbank.conf b/egs2/TEMPLATE/enh_st1/conf/fbank.conf
new file mode 100644
index 00000000000..82ac7bd0dbc
--- /dev/null
+++ b/egs2/TEMPLATE/enh_st1/conf/fbank.conf
@@ -0,0 +1,2 @@
+--sample-frequency=16000 
+--num-mel-bins=80
diff --git a/egs2/TEMPLATE/enh_st1/conf/pbs.conf b/egs2/TEMPLATE/enh_st1/conf/pbs.conf
new file mode 100644
index 00000000000..119509938ce
--- /dev/null
+++ b/egs2/TEMPLATE/enh_st1/conf/pbs.conf
@@ -0,0 +1,11 @@
+# Default configuration
+command qsub -V -v PATH -S /bin/bash
+option name=* -N $0
+option mem=* -l mem=$0
+option mem=0          # Do not add anything to qsub_opts
+option num_threads=* -l ncpus=$0
+option num_threads=1  # Do not add anything to qsub_opts
+option num_nodes=* -l nodes=$0:ppn=1
+default gpu=0
+option gpu=0
+option gpu=* -l ngpus=$0
diff --git a/egs2/TEMPLATE/enh_st1/conf/pitch.conf b/egs2/TEMPLATE/enh_st1/conf/pitch.conf
new file mode 100644
index 00000000000..e959a19d5b8
--- /dev/null
+++ b/egs2/TEMPLATE/enh_st1/conf/pitch.conf
@@ -0,0 +1 @@
+--sample-frequency=16000
diff --git a/egs2/TEMPLATE/enh_st1/conf/queue.conf b/egs2/TEMPLATE/enh_st1/conf/queue.conf
new file mode 100644
index 00000000000..500582fab31
--- /dev/null
+++ b/egs2/TEMPLATE/enh_st1/conf/queue.conf
@@ -0,0 +1,12 @@
+# Default configuration
+command qsub -v PATH -cwd -S /bin/bash -j y -l arch=*64*
+option name=* -N $0
+option mem=* -l mem_free=$0,ram_free=$0
+option mem=0          # Do not add anything to qsub_opts
+option num_threads=* -pe smp $0
+option num_threads=1  # Do not add anything to qsub_opts
+option max_jobs_run=* -tc $0
+option num_nodes=* -pe mpi $0  # You must set this PE as allocation_rule=1
+default gpu=0
+option gpu=0
+option gpu=* -l gpu=$0 -q g.q
diff --git a/egs2/TEMPLATE/enh_st1/conf/slurm.conf b/egs2/TEMPLATE/enh_st1/conf/slurm.conf
new file mode 100644
index 00000000000..3b229673638
--- /dev/null
+++ b/egs2/TEMPLATE/enh_st1/conf/slurm.conf
@@ -0,0 +1,14 @@
+# Default configuration
+command sbatch --export=PATH
+option name=* --job-name $0
+option time=* --time $0
+option mem=* --mem-per-cpu $0
+option mem=0
+option num_threads=* --cpus-per-task $0
+option num_threads=1 --cpus-per-task 1
+option num_nodes=* --nodes $0
+default gpu=0
+option gpu=0 -p cpu
+option gpu=* -p gpu --gres=gpu:$0 -c $0  # Recommend allocating more CPU than, or equal to the number of GPU
+# note: the --max-jobs-run option is supported as a special case
+# by slurm.pl and you don't have to handle it in the config file.
diff --git a/egs2/TEMPLATE/enh_st1/db.sh b/egs2/TEMPLATE/enh_st1/db.sh
new file mode 120000
index 00000000000..318d781d123
--- /dev/null
+++ b/egs2/TEMPLATE/enh_st1/db.sh
@@ -0,0 +1 @@
+../asr1/db.sh
\ No newline at end of file
diff --git a/egs2/TEMPLATE/enh_st1/enh_st.sh b/egs2/TEMPLATE/enh_st1/enh_st.sh
new file mode 100755
index 00000000000..68eec2a7665
--- /dev/null
+++ b/egs2/TEMPLATE/enh_st1/enh_st.sh
@@ -0,0 +1,1806 @@
+#!/usr/bin/env bash
+
+# Set bash to 'debug' mode, it will exit on :
+# -e 'error', -u 'undefined variable', -o ... 'error in pipeline', -x 'print commands',
+set -e
+set -u
+set -o pipefail
+
+log() {
+    local fname=${BASH_SOURCE[1]##*/}
+    echo -e "$(date '+%Y-%m-%dT%H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
+}
+min() {  # Print the smallest of the given integer arguments.
+  local smallest candidate
+  smallest=$1
+  for candidate; do
+      if [ "${smallest}" -ge "${candidate}" ]; then
+          smallest="${candidate}"
+      fi
+  done
+  echo "${smallest}"
+}
+SECONDS=0
+
+# General configuration
+stage=1              # Processes starts from the specified stage.
+stop_stage=10000     # Processes is stopped at the specified stage.
+skip_data_prep=false # Skip data preparation stages.
+skip_train=false     # Skip training stages.
+skip_eval=false      # Skip decoding and evaluation stages.
+skip_upload_hf=true  # Skip uploading to hugging face stages.
+ngpu=1               # The number of gpus ("0" uses cpu, otherwise use gpu).
+num_nodes=1          # The number of nodes.
+nj=32                # The number of parallel jobs.
+inference_nj=32      # The number of parallel jobs in decoding.
+gpu_inference=false  # Whether to perform gpu decoding.
+dumpdir=dump         # Directory to dump features.
+expdir=exp           # Directory to save experiments.
+python=python3       # Specify python to execute espnet commands.
+
+# Data preparation related
+local_data_opts= # The options given to local/data.sh.
+
+# Speed perturbation related
+speed_perturb_factors=  # perturbation factors, e.g. "0.9 1.0 1.1" (separated by space).
+
+# Feature extraction related
+feats_type=raw       # Feature type (raw or fbank_pitch).
+audio_format=flac    # Audio format: wav, flac, wav.ark, flac.ark  (only in feats_type=raw).
+fs=16k               # Sampling rate.
+min_wav_duration=0.1 # Minimum duration in second.
+max_wav_duration=20  # Maximum duration in second.
+
+# Tokenization related
+oov="<unk>"         # Out of vocabulary symbol.
+blank="<blank>"     # CTC blank symbol.
+sos_eos="<sos/eos>" # sos and eos symbol.
+token_joint=false       # whether to use a single bpe system for both source and target languages
+src_case=lc.rm
+src_token_type=bpe      # Tokenization type (char or bpe) for source languages.
+src_nbpe=30             # The number of BPE vocabulary for source language.
+src_bpemode=unigram     # Mode of BPE for source language (unigram or bpe).
+src_bpe_input_sentence_size=100000000 # Size of input sentence for BPE for source language.
+src_bpe_nlsyms=         # non-linguistic symbols list, separated by a comma, for BPE of source language
+src_bpe_char_cover=1.0  # character coverage when modeling BPE for source language
+tgt_case=tc
+tgt_token_type=bpe      # Tokenization type (char or bpe) for target language.
+tgt_nbpe=30             # The number of BPE vocabulary for target language.
+tgt_bpemode=unigram     # Mode of BPE (unigram or bpe) for target language.
+tgt_bpe_input_sentence_size=100000000 # Size of input sentence for BPE for target language.
+tgt_bpe_nlsyms=         # non-linguistic symbols list, separated by a comma, for BPE for target language.
+tgt_bpe_char_cover=1.0  # character coverage when modeling BPE for target language.
+
+# Ngram model related
+use_ngram=false
+ngram_exp=
+ngram_num=3
+
+# Language model related
+use_lm=true       # Use language model for ST decoding.
+lm_tag=           # Suffix to the result dir for language model training.
+lm_exp=           # Specify the directory path for LM experiment.
+                  # If this option is specified, lm_tag is ignored.
+lm_stats_dir=     # Specify the directory path for LM statistics.
+lm_config=        # Config for language model training.
+lm_args=          # Arguments for language model training, e.g., "--max_epoch 10".
+                  # Note that it will overwrite args in lm config.
+use_word_lm=false # Whether to use word language model.
+num_splits_lm=1   # Number of splitting for lm corpus.
+# shellcheck disable=SC2034
+word_vocab_size=10000 # Size of word vocabulary.
+
+# ST model related
+enh_st_tag=        # Suffix to the result dir for st model training.
+enh_st_exp=        # Specify the directory path for ST experiment.
+                   # If this option is specified, enh_st_tag is ignored.
+enh_st_stats_dir=  # Specify the directory path for ST statistics.
+enh_st_config=     # Config for st model training.
+enh_st_args=       # Arguments for st model training, e.g., "--max_epoch 10".
+                   # Note that it will overwrite args in st config.
+pretrained_model=          # Pretrained model to load
+ignore_init_mismatch=false # Ignore initial mismatch
+feats_normalize=global_mvn # Normalization layer type.
+num_splits_st=1            # Number of splitting for lm corpus.
+src_lang=es                # source language abbrev. id (e.g., es)
+tgt_lang=en                # target language abbrev. id (e.g., en)
+
+# Upload model related
+hf_repo=
+
+# Decoding related
+use_k2=false        # Whether to use k2 based decoder
+batch_size=1
+inference_tag=      # Suffix to the result dir for decoding.
+inference_config=   # Config for decoding.
+st_inference_args=  # Arguments for decoding, e.g., "--lm_weight 0.1".
+                    # Note that it will overwrite args in inference config.
+enh_inference_args="--normalize_output_wav true"
+inference_lm=valid.loss.ave.pth       # Language model path for decoding.
+inference_ngram=${ngram_num}gram.bin
+inference_enh_st_model=valid.acc.ave.pth # ST model path for decoding.
+                                      # e.g.
+                                      # inference_enh_st_model=train.loss.best.pth
+                                      # inference_enh_st_model=3epoch.pth
+                                      # inference_enh_st_model=valid.acc.best.pth
+                                      # inference_enh_st_model=valid.loss.ave.pth
+download_model= # Download a model from Model Zoo and use it for decoding.
+
+# Enhancement related arguments
+spk_num=1           # Number of speakers
+noise_type_num=1    # Number of noise types in the input audio
+dereverb_ref_num=1  # Number of references for dereverberation
+# Evaluation related
+# NOTE: enh_inference_args is already defined once in "Decoding related" above.
+scoring_protocol="STOI SDR SAR SIR SI_SNR"
+ref_channel=0
+
+# Enh Training data related
+use_dereverb_ref=false
+use_noise_ref=false
+
+# [Task dependent] Set the datadir name created by local/data.sh
+train_set=       # Name of training set.
+valid_set=       # Name of validation set used for monitoring/tuning network training.
+test_sets=       # Names of test sets. Multiple items (e.g., both dev and eval sets) can be specified.
+src_bpe_train_text=  # Text file path of bpe training set for source language.
+tgt_bpe_train_text=  # Text file path of bpe training set for target language.
+lm_train_text=   # Text file path of language model training set.
+lm_dev_text=     # Text file path of language model development set.
+lm_test_text=    # Text file path of language model evaluation set.
+nlsyms_txt=none  # Non-linguistic symbol list if existing.
+cleaner=none     # Text cleaner.
+g2p=none         # g2p method (needed if token_type=phn).
+lang=noinfo      # The language type of corpus.
+score_opts=                # The options given to sclite scoring
+local_score_opts=          # The options given to local/score.sh.
+enh_st_speech_fold_length=800 # fold_length for speech data during ST training.
+enh_st_text_fold_length=150   # fold_length for text data during ST training.
+lm_fold_length=150         # fold_length for LM training.
+
+help_message=$(cat << EOF
+Usage: $0 --train_set "<train_set_name>" --valid_set "<valid_set_name>" --test_sets "<test_set_names>"
+
+Options:
+    # General configuration
+    --stage          # Processes starts from the specified stage (default="${stage}").
+    --stop_stage     # Processes is stopped at the specified stage (default="${stop_stage}").
+    --skip_data_prep # Skip data preparation stages (default="${skip_data_prep}").
+    --skip_train     # Skip training stages (default="${skip_train}").
+    --skip_eval      # Skip decoding and evaluation stages (default="${skip_eval}").
+    --skip_upload_hf    # Skip packing and uploading stages (default="${skip_upload_hf}").
+    --ngpu           # The number of gpus ("0" uses cpu, otherwise use gpu, default="${ngpu}").
+    --num_nodes      # The number of nodes (default="${num_nodes}").
+    --nj             # The number of parallel jobs (default="${nj}").
+    --inference_nj   # The number of parallel jobs in decoding (default="${inference_nj}").
+    --gpu_inference  # Whether to perform gpu decoding (default="${gpu_inference}").
+    --dumpdir        # Directory to dump features (default="${dumpdir}").
+    --expdir         # Directory to save experiments (default="${expdir}").
+    --python         # Specify python to execute espnet commands (default="${python}").
+
+    # Data preparation related
+    --local_data_opts # The options given to local/data.sh (default="${local_data_opts}").
+
+    # Speed perturbation related
+    --speed_perturb_factors # speed perturbation factors, e.g. "0.9 1.0 1.1" (separated by space, default="${speed_perturb_factors}").
+
+    # Feature extraction related
+    --feats_type       # Feature type (raw, fbank_pitch, fbank or extracted, default="${feats_type}").
+    --audio_format     # Audio format: wav, flac, wav.ark, flac.ark  (only in feats_type=raw, default="${audio_format}").
+    --fs               # Sampling rate (default="${fs}").
+    --min_wav_duration # Minimum duration in second (default="${min_wav_duration}").
+    --max_wav_duration # Maximum duration in second (default="${max_wav_duration}").
+
+    # Tokenization related
+    --oov                     # Out of vocabulary symbol (default="${oov}").
+    --blank                   # CTC blank symbol (default="${blank}").
+    --sos_eos                 # sos and eos symbol (default="${sos_eos}").
+    --token_joint=false       # Whether to use a single bpe system for both source and target languages.
+                              # if set as true, will use tgt_* for processing (default="${token_joint}").
+    --src_token_type=bpe      # Tokenization type (char or bpe) for source languages. (default="${src_token_type}").
+    --src_nbpe=30             # The number of BPE vocabulary for source language. (default="${src_nbpe}").
+    --src_bpemode=unigram     # Mode of BPE for source language (unigram or bpe). (default="${src_bpemode}").
+    --src_bpe_input_sentence_size=100000000 # Size of input sentence for BPE for source language. (default="${src_bpe_input_sentence_size}").
+    --src_bpe_nlsyms=         # Non-linguistic symbols list, separated by a comma, for BPE of source language. (default="${src_bpe_nlsyms}").
+    --src_bpe_char_cover=1.0  # Character coverage when modeling BPE for source language. (default="${src_bpe_char_cover}").
+    --tgt_token_type=bpe      # Tokenization type (char or bpe) for target language. (default="${tgt_token_type}").
+    --tgt_nbpe=30             # The number of BPE vocabulary for target language. (default="${tgt_nbpe}").
+    --tgt_bpemode=unigram     # Mode of BPE (unigram or bpe) for target language. (default="${tgt_bpemode}").
+    --tgt_bpe_input_sentence_size=100000000 # Size of input sentence for BPE for target language. (default="${tgt_bpe_input_sentence_size}").
+    --tgt_bpe_nlsyms=         # Non-linguistic symbols list, separated by a comma, for BPE for target language. (default="${tgt_bpe_nlsyms}").
+    --tgt_bpe_char_cover=1.0  # Character coverage when modeling BPE for target language. (default="${tgt_bpe_char_cover}").
+
+    # Language model related
+    --lm_tag          # Suffix to the result dir for language model training (default="${lm_tag}").
+    --lm_exp          # Specify the directory path for LM experiment.
+                      # If this option is specified, lm_tag is ignored (default="${lm_exp}").
+    --lm_stats_dir    # Specify the directory path for LM statistics (default="${lm_stats_dir}").
+    --lm_config       # Config for language model training (default="${lm_config}").
+    --lm_args         # Arguments for language model training (default="${lm_args}").
+                      # e.g., --lm_args "--max_epoch 10"
+                      # Note that it will overwrite args in lm config.
+    --use_word_lm     # Whether to use word language model (default="${use_word_lm}").
+    --word_vocab_size # Size of word vocabulary (default="${word_vocab_size}").
+    --num_splits_lm   # Number of splitting for lm corpus (default="${num_splits_lm}").
+
+    # ST model related
+    --enh_st_tag           # Suffix to the result dir for st model training (default="${enh_st_tag}").
+    --enh_st_exp           # Specify the directory path for ST experiment.
+                       # If this option is specified, enh_st_tag is ignored (default="${enh_st_exp}").
+    --enh_st_stats_dir     # Specify the directory path for ST statistics (default="${enh_st_stats_dir}").
+    --enh_st_config        # Config for st model training (default="${enh_st_config}").
+    --enh_st_args          # Arguments for st model training (default="${enh_st_args}").
+                           # e.g., --enh_st_args "--max_epoch 10"
+                           # Note that it will overwrite args in st config.
+    --pretrained_model=          # Pretrained model to load (default="${pretrained_model}").
+    --ignore_init_mismatch=      # Ignore mismatch parameter init with pretrained model (default="${ignore_init_mismatch}").
+    --feats_normalize  # Normalization layer type. (default="${feats_normalize}").
+    --num_splits_st    # Number of splitting for lm corpus.  (default="${num_splits_st}").
+    --src_lang=        # source language abbrev. id (e.g., es). (default="${src_lang}")
+    --tgt_lang=        # target language abbrev. id (e.g., en). (default="${tgt_lang}")
+
+    # Decoding related
+    --inference_tag       # Suffix to the result dir for decoding (default="${inference_tag}").
+    --inference_config    # Config for decoding (default="${inference_config}").
+    --st_inference_args   # Arguments for decoding (default="${st_inference_args}").
+                          # e.g., --st_inference_args "--lm_weight 0.1"
+                          # Note that it will overwrite args in inference config.
+    --enh_inference_args     # Arguments for enhancement (default="${enh_inference_args}").
+    --inference_lm        # Language model path for decoding (default="${inference_lm}").
+    --inference_enh_st_model # ST model path for decoding (default="${inference_enh_st_model}").
+    --download_model      # Download a model from Model Zoo and use it for decoding (default="${download_model}").
+
+    --spk_num          # Number of speakers (default="${spk_num}")
+    --noise_type_num   # Number of noise types in the input audio (default="${noise_type_num}")
+    --dereverb_ref_num # Number of references for dereverberation (default="${dereverb_ref_num}")
+    --use_dereverb_ref # Whether or not to use dereverberated signal as an additional reference
+                         for training a dereverberation model (default="${use_dereverb_ref}")
+    --use_noise_ref    # Whether or not to use noise signal as an additional reference
+                         for training a denoising model (default="${use_noise_ref}")
+    # Enhancement Evaluation related
+    --scoring_protocol    # Metrics to be used for scoring (default="${scoring_protocol}")
+    --ref_channel         # Reference channel of the reference speech will be used if the model
+                            output is single-channel and reference speech is multi-channel
+                            (default="${ref_channel}")
+    # [Task dependent] Set the datadir name created by local/data.sh
+    --train_set     # Name of training set (required).
+    --valid_set     # Name of validation set used for monitoring/tuning network training (required).
+    --test_sets     # Names of test sets.
+                    # Multiple items (e.g., both dev and eval sets) can be specified (required).
+    --src_bpe_train_text # Text file path of bpe training set for source language.
+    --tgt_bpe_train_text # Text file path of bpe training set for target language
+    --lm_train_text  # Text file path of language model training set.
+    --lm_dev_text   # Text file path of language model development set (default="${lm_dev_text}").
+    --lm_test_text  # Text file path of language model evaluation set (default="${lm_test_text}").
+    --nlsyms_txt    # Non-linguistic symbol list if existing (default="${nlsyms_txt}").
+    --cleaner       # Text cleaner (default="${cleaner}").
+    --g2p           # g2p method (default="${g2p}").
+    --lang          # The language type of corpus (default="${lang}").
+    --score_opts             # The options given to sclite scoring (default="${score_opts}").
+    --local_score_opts       # The options given to local/score.sh (default="${local_score_opts}").
+    --enh_st_speech_fold_length # fold_length for speech data during ST training (default="${enh_st_speech_fold_length}").
+    --enh_st_text_fold_length   # fold_length for text data during ST training (default="${enh_st_text_fold_length}").
+    --lm_fold_length         # fold_length for LM training (default="${lm_fold_length}").
+EOF
+)
+
+log "$0 $*"
+# Save command line args for logging (they will be lost after utils/parse_options.sh)
+run_args=$(pyscripts/utils/print_args.py "$0" "$@")
+. utils/parse_options.sh
+
+if [ $# -ne 0 ]; then
+    log "${help_message}"
+    log "Error: No positional arguments are required."
+    exit 2
+fi
+
+. ./path.sh
+. ./cmd.sh
+
+
+# Check required arguments
+[ -z "${train_set}" ] && { log "${help_message}"; log "Error: --train_set is required"; exit 2; };
+[ -z "${valid_set}" ] && { log "${help_message}"; log "Error: --valid_set is required"; exit 2; };
+[ -z "${test_sets}" ] && { log "${help_message}"; log "Error: --test_sets is required"; exit 2; };
+
+[ "${spk_num}" -gt 1 ] && { log "${help_message}"; log "Error: --spk_num only 1 is supported"; exit 2; };
+
+# Check feature type
+if [ "${feats_type}" = raw ]; then
+    data_feats=${dumpdir}/raw
+elif [ "${feats_type}" = fbank_pitch ]; then
+    data_feats=${dumpdir}/fbank_pitch
+elif [ "${feats_type}" = fbank ]; then
+    data_feats=${dumpdir}/fbank
+elif [ "${feats_type}" == extracted ]; then
+    data_feats=${dumpdir}/extracted
+else
+    log "${help_message}"
+    log "Error: not supported: --feats_type ${feats_type}"
+    exit 2
+fi
+
+# Extra files for translation process
+utt_extra_files="text.${src_case}.${src_lang} text.${tgt_case}.${tgt_lang}"
+# Extra files for enhancement process
+utt_extra_files+=" utt2category"
+# Use the same text as ST for bpe training if not specified.
+[ -z "${src_bpe_train_text}" ] && src_bpe_train_text="${data_feats}/${train_set}/text.${src_case}.${src_lang}"
+[ -z "${tgt_bpe_train_text}" ] && tgt_bpe_train_text="${data_feats}/${train_set}/text.${tgt_case}.${tgt_lang}"
+# Use the same text as ST for lm training if not specified.
+[ -z "${lm_train_text}" ] && lm_train_text="${data_feats}/${train_set}/text.${tgt_case}.${tgt_lang}"
+# Use the same text as ST for lm training if not specified.
+[ -z "${lm_dev_text}" ] && lm_dev_text="${data_feats}/${valid_set}/text.${tgt_case}.${tgt_lang}"
+# Use the text of the 1st evaldir if lm_test is not specified
+[ -z "${lm_test_text}" ] && lm_test_text="${data_feats}/${test_sets%% *}/text.${tgt_case}.${tgt_lang}"
+
+# Check tokenization type
+if [ "${lang}" != noinfo ]; then
+    token_listdir=data/${lang}_token_list
+else
+    token_listdir=data/token_list
+fi
+# The tgt bpedir is set for all cases when using bpe
+tgt_bpedir="${token_listdir}/tgt_bpe_${tgt_bpemode}${tgt_nbpe}"
+tgt_bpeprefix="${tgt_bpedir}"/bpe
+tgt_bpemodel="${tgt_bpeprefix}".model
+tgt_bpetoken_list="${tgt_bpedir}"/tokens.txt
+tgt_chartoken_list="${token_listdir}"/char/tgt_tokens.txt
+if "${token_joint}"; then
+    # if token_joint, the bpe training will use both src_lang and tgt_lang to train a single bpe model
+    src_bpedir="${tgt_bpedir}"
+    src_bpeprefix="${tgt_bpeprefix}"
+    src_bpemodel="${tgt_bpemodel}"
+    src_bpetoken_list="${tgt_bpetoken_list}"
+    src_chartoken_list="${tgt_chartoken_list}"
+else
+    src_bpedir="${token_listdir}/src_bpe_${src_bpemode}${src_nbpe}"  # named after the source-side BPE settings
+    src_bpeprefix="${src_bpedir}"/bpe
+    src_bpemodel="${src_bpeprefix}".model
+    src_bpetoken_list="${src_bpedir}"/tokens.txt
+    src_chartoken_list="${token_listdir}"/char/src_tokens.txt
+fi
+
+# NOTE: keep for future development.
+# shellcheck disable=SC2034
+tgt_wordtoken_list="${token_listdir}"/word/tgt_tokens.txt
+if "${token_joint}"; then
+    src_wordtoken_list="${tgt_wordtoken_list}"
+else
+    src_wordtoken_list="${token_listdir}"/word/src_tokens.txt
+fi
+
+# Set token types for src and tgt langs
+if [ "${src_token_type}" = bpe ]; then
+    src_token_list="${src_bpetoken_list}"
+elif [ "${src_token_type}" = char ]; then
+    src_token_list="${src_chartoken_list}"
+    src_bpemodel=none
+elif [ "${src_token_type}" = word ]; then
+    src_token_list="${src_wordtoken_list}"
+    src_bpemodel=none
+else
+    log "Error: not supported --src_token_type '${src_token_type}'"
+    exit 2
+fi
+if [ "${tgt_token_type}" = bpe ]; then
+    tgt_token_list="${tgt_bpetoken_list}"
+elif [ "${tgt_token_type}" = char ]; then
+    tgt_token_list="${tgt_chartoken_list}"
+    tgt_bpemodel=none
+elif [ "${tgt_token_type}" = word ]; then
+    tgt_token_list="${tgt_wordtoken_list}"
+    tgt_bpemodel=none
+else
+    log "Error: not supported --tgt_token_type '${tgt_token_type}'"
+    exit 2
+fi
+if ${use_word_lm}; then
+    log "Error: Word LM is not supported yet"
+    exit 2
+
+    lm_token_list="${tgt_wordtoken_list}"
+    lm_token_type=word
+else
+    lm_token_list="${tgt_token_list}"
+    lm_token_type="${tgt_token_type}"
+fi
+
+
+# Set tag for naming of model directory
+if [ -z "${enh_st_tag}" ]; then
+    if [ -n "${enh_st_config}" ]; then
+        enh_st_tag="$(basename "${enh_st_config}" .yaml)_${feats_type}"
+    else
+        enh_st_tag="train_${feats_type}"
+    fi
+    if [ "${lang}" != noinfo ]; then
+        enh_st_tag+="_${lang}_${tgt_token_type}_${tgt_case}"
+    else
+        enh_st_tag+="_${tgt_token_type}_${tgt_case}"
+    fi
+    if [ "${tgt_token_type}" = bpe ]; then
+        enh_st_tag+="${tgt_nbpe}"
+    fi
+    # Add overwritten arg's info
+    if [ -n "${enh_st_args}" ]; then
+        enh_st_tag+="$(echo "${enh_st_args}" | sed -e "s/--/\_/g" -e "s/[ |=/]//g")"
+    fi
+    if [ -n "${speed_perturb_factors}" ]; then
+        enh_st_tag+="_sp"
+    fi
+fi
+if [ -z "${lm_tag}" ]; then
+    if [ -n "${lm_config}" ]; then
+        lm_tag="$(basename "${lm_config}" .yaml)"
+    else
+        lm_tag="train"
+    fi
+    if [ "${lang}" != noinfo ]; then
+        lm_tag+="_${lang}_${lm_token_type}"
+    else
+        lm_tag+="_${lm_token_type}"
+    fi
+    if [ "${lm_token_type}" = bpe ]; then
+        lm_tag+="${tgt_nbpe}"
+    fi
+    # Add overwritten arg's info
+    if [ -n "${lm_args}" ]; then
+        lm_tag+="$(echo "${lm_args}" | sed -e "s/--/\_/g" -e "s/[ |=/]//g")"
+    fi
+fi
+
+# The directory used for collect-stats mode
+if [ -z "${enh_st_stats_dir}" ]; then
+    if [ "${lang}" != noinfo ]; then
+        enh_st_stats_dir="${expdir}/enh_st_stats_${feats_type}_${lang}_${tgt_token_type}"
+    else
+        enh_st_stats_dir="${expdir}/enh_st_stats_${feats_type}_${tgt_token_type}"
+    fi
+    if [ "${tgt_token_type}" = bpe ]; then
+        enh_st_stats_dir+="${tgt_nbpe}"
+    fi
+    if [ -n "${speed_perturb_factors}" ]; then
+        enh_st_stats_dir+="_sp"
+    fi
+fi
+if [ -z "${lm_stats_dir}" ]; then
+    if [ "${lang}" != noinfo ]; then
+        lm_stats_dir="${expdir}/lm_stats_${lang}_${lm_token_type}"
+    else
+        lm_stats_dir="${expdir}/lm_stats_${lm_token_type}"
+    fi
+    if [ "${lm_token_type}" = bpe ]; then
+        lm_stats_dir+="${tgt_nbpe}"
+    fi
+fi
+# The directory used for training commands
+if [ -z "${enh_st_exp}" ]; then
+    enh_st_exp="${expdir}/enh_st_${enh_st_tag}"
+fi
+if [ -z "${lm_exp}" ]; then
+    lm_exp="${expdir}/lm_${lm_tag}"
+fi
+if [ -z "${ngram_exp}" ]; then
+    ngram_exp="${expdir}/ngram"
+fi
+
+
+if [ -z "${inference_tag}" ]; then
+    if [ -n "${inference_config}" ]; then
+        inference_tag="$(basename "${inference_config}" .yaml)"
+    else
+        inference_tag=inference
+    fi
+    # Add overwritten arg's info
+    if [ -n "${st_inference_args}" ]; then
+        inference_tag+="$(echo "${st_inference_args}" | sed -e "s/--/\_/g" -e "s/[ |=]//g")"
+    fi
+    if [ -n "${enh_inference_args}" ]; then
+        inference_tag+="$(echo "${enh_inference_args}" | sed -e "s/--/\_/g" -e "s/[ |=]//g")"
+    fi
+    if "${use_lm}"; then
+        inference_tag+="_lm_$(basename "${lm_exp}")_$(echo "${inference_lm}" | sed -e "s/\//_/g" -e "s/\.[^.]*$//g")"
+    fi
+    if "${use_ngram}"; then
+        inference_tag+="_ngram_$(basename "${ngram_exp}")_$(echo "${inference_ngram}" | sed -e "s/\//_/g" -e "s/\.[^.]*$//g")"
+    fi
+    inference_tag+="_enh_st_model_$(echo "${inference_enh_st_model}" | sed -e "s/\//_/g" -e "s/\.[^.]*$//g")"
+
+    if "${use_k2}"; then
+      inference_tag+="_use_k2"
+    fi
+fi
+
+# ========================== Main stages start from here. ==========================
+
+if ! "${skip_data_prep}"; then
+    # Stage 1 runs only when the range [stage, stop_stage] includes 1.
+    # All corpus-specific preparation is delegated to the recipe's local/data.sh.
+    if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+        log "Stage 1: Data preparation for data/${train_set}, data/${valid_set}, etc."
+        # [Task dependent] Need to create data.sh for new corpus
+        # local_data_opts is left unquoted, so it is word-split into separate arguments.
+        local/data.sh ${local_data_opts}
+    fi
+
+    # Stage 2: create one speed-perturbed copy of the training set per factor in
+    # speed_perturb_factors and combine all copies into data/${train_set}_sp.
+    # The stage is a no-op when speed_perturb_factors is empty.
+    if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+        if [ -n "${speed_perturb_factors}" ]; then
+            log "Stage 2: Speed perturbation: data/${train_set} -> data/${train_set}_sp"
+
+            # Audio lists that are perturbed together: the input mixture and every speaker reference
+            _scp_list="wav.scp "
+            for i in $(seq ${spk_num}); do
+                _scp_list+="spk${i}.scp "
+            done
+
+            # Start from an empty list so that a value of _dirs left over from
+            # the environment is never passed to combine_data.sh
+            _dirs=""
+            for factor in ${speed_perturb_factors}; do
+                if [[ $(bc <<<"${factor} != 1.0") == 1 ]]; then
+                    scripts/utils/perturb_enh_data_dir_speed.sh --utt_extra_files "${utt_extra_files}" \
+                         "${factor}" "data/${train_set}" "data/${train_set}_sp${factor}" "${_scp_list}"
+                    _dirs+="data/${train_set}_sp${factor} "
+                else
+                    # If speed factor is 1, same as the original
+                    _dirs+="data/${train_set} "
+                fi
+            done
+            # _dirs is a space-separated list and must be word-split here
+            # shellcheck disable=SC2086
+            utils/combine_data.sh --extra_files "${utt_extra_files} ${_scp_list}" "data/${train_set}_sp" ${_dirs}
+            for extra_file in ${utt_extra_files}; do
+                # The extra files (e.g. utt2category) are optional: de-duplicate the keys
+                # only when the combined file exists, instead of failing on a missing file
+                if [ -f "data/${train_set}_sp/${extra_file}" ]; then
+                    ${python} pyscripts/utils/remove_duplicate_keys.py "data/${train_set}_sp/${extra_file}" \
+                        > "data/${train_set}_sp/${extra_file}.tmp"
+                    mv "data/${train_set}_sp/${extra_file}.tmp" "data/${train_set}_sp/${extra_file}"
+                fi
+            done
+        else
+           log "Skip stage 2: Speed perturbation"
+        fi
+    fi
+
+    # From here on, train_set names the combined speed-perturbed directory
+    # ("<train_set>_sp", the output of stage 2). The assignment sits outside the
+    # stage-2 guard, so it also applies when the run starts at a later stage.
+    # NOTE(review): it is still inside the skip_data_prep guard, so the suffix is
+    # not added when data preparation is skipped — confirm this is intended.
+    if [ -n "${speed_perturb_factors}" ]; then
+        train_set="${train_set}_sp"
+    fi
+
+    # Stage 3: copy every data directory below ${data_feats} and re-create all
+    # audio lists (mixture and reference signals) with format_wav_scp.sh.
+    # Only feats_type=raw is supported.
+    if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
+        if [ "${feats_type}" = raw ]; then
+            log "Stage 3: Format wav.scp: data/ -> ${data_feats}"
+
+            # ====== Recreating "wav.scp" ======
+            # Kaldi-wav.scp, which can describe the file path with unix-pipe, like "cat /some/path |",
+            # shouldn't be used in training process.
+            # "format_wav_scp.sh" dumps such pipe-style-wav to real audio file
+            # and it can also change the audio-format and sampling rate.
+            # If nothing is needed, then format_wav_scp.sh does nothing:
+            # i.e. the input file format and rate are the same as the output.
+
+            for dset in "${train_set}" "${valid_set}" ${test_sets}; do
+                # Training and validation sets are written to "org" first,
+                # because stage 4 filters them by length afterwards
+                if [ "${dset}" = "${train_set}" ] || [ "${dset}" = "${valid_set}" ]; then
+                    _suf="/org"
+                else
+                    _suf=""
+                fi
+                utils/copy_data_dir.sh --validate_opts --non-print data/"${dset}" "${data_feats}${_suf}/${dset}"
+
+                # Expand utt_extra_files for multi-references: every regular file whose
+                # name starts with an entry of utt_extra_files is copied and recorded
+                expand_utt_extra_files=""
+                for extra_file in ${utt_extra_files}; do
+                    # Use a glob instead of parsing the output of "ls"
+                    for single_file in data/"${dset}"/"${extra_file}"*; do
+                        # An unmatched glob stays literal; the extra files are optional, so skip it
+                        [ -f "${single_file}" ] || continue
+                        cp "${single_file}" "${data_feats}${_suf}/${dset}"
+                        expand_utt_extra_files="${expand_utt_extra_files} $(basename "${single_file}")"
+                    done
+                done
+                echo "${expand_utt_extra_files}"
+                utils/fix_data_dir.sh --utt_extra_files "${expand_utt_extra_files}" "${data_feats}${_suf}/${dset}"
+                # Keep a single entry per utterance ID in each extra file
+                for extra_file in ${expand_utt_extra_files}; do
+                    LC_ALL=C sort -u -k1,1 "${data_feats}${_suf}/${dset}/${extra_file}" -o "${data_feats}${_suf}/${dset}/${extra_file}"
+                done
+
+                # These files are removed from the copy; the audio lists are re-created below
+                rm -f "${data_feats}${_suf}/${dset}"/{segments,wav.scp,reco2file_and_channel,reco2dur}
+                _opts=
+                if [ -e data/"${dset}"/segments ]; then
+                    # "segments" is used for splitting wav files which are written in "wav".scp
+                    # into utterances. The file format of segments:
+                    #   <segment_id> <record_id> <start_time> <end_time>
+                    #   "e.g. call-861225-A-0050-0065 call-861225-A 5.0 6.5"
+                    # Where the time is written in seconds.
+                    _opts+="--segments data/${dset}/segments "
+                fi
+
+                _spk_list=" "
+                for i in $(seq ${spk_num}); do
+                    _spk_list+="spk${i} "
+                done
+                # Noise and dereverberation references are only formatted for the
+                # training and validation sets (the ones with a non-empty _suf)
+                if ${use_noise_ref} && [ -n "${_suf}" ]; then
+                    # references for denoising ("noise1 noise2 ... noise${noise_type_num} ")
+                    _spk_list+=$(for n in $(seq $noise_type_num); do echo -n "noise$n "; done)
+                fi
+                if ${use_dereverb_ref} && [ -n "${_suf}" ]; then
+                    # references for dereverberation
+                    _spk_list+=$(for n in $(seq $dereverb_ref_num); do echo -n "dereverb$n "; done)
+                fi
+
+                for spk in ${_spk_list} "wav" ; do
+                    # shellcheck disable=SC2086
+                    scripts/audio/format_wav_scp.sh --nj "${nj}" --cmd "${train_cmd}" \
+                        --out-filename "${spk}.scp" \
+                        --ref_channels "0" \
+                        --audio-format "${audio_format}" --fs "${fs}" ${_opts} \
+                        "data/${dset}/${spk}.scp" "${data_feats}${_suf}/${dset}" \
+                        "${data_feats}${_suf}/${dset}/logs/${spk}" "${data_feats}${_suf}/${dset}/data/${spk}"
+                done
+
+                echo "${feats_type}" > "${data_feats}${_suf}/${dset}/feats_type"
+            done
+
+        else
+            log "Error: not supported: --feats_type ${feats_type}"
+            exit 2
+        fi
+    fi
+
+
+    # Stage 4: copy the training/validation sets from ${data_feats}/org to
+    # ${data_feats}, keeping only utterances whose length lies strictly between
+    # min_wav_duration and max_wav_duration and whose text is not empty.
+    # It also writes the LM training text to ${data_feats}/lm_train.txt.
+    if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
+        log "Stage 4: Remove long/short data: ${data_feats}/org -> ${data_feats}"
+
+        # NOTE(kamo): Not applying to test_sets to keep original data
+        for dset in "${train_set}" "${valid_set}"; do
+            # Names of the reference signals (and of their scp files) to be filtered
+            _spk_list=" "
+            _scp_list=" "
+            for i in $(seq ${spk_num}); do
+                _spk_list+="spk${i} "
+                _scp_list+="spk${i}.scp "
+            done
+            if $use_noise_ref; then
+                # references for denoising ("noise1 noise2 ... noise${noise_type_num} ")
+                _spk_list+=$(for n in $(seq $noise_type_num); do echo -n "noise$n "; done)
+                _scp_list+=$(for n in $(seq $noise_type_num); do echo -n "noise$n.scp "; done)
+            fi
+            if $use_dereverb_ref; then
+                # references for dereverberation
+                _spk_list+=$(for n in $(seq $dereverb_ref_num); do echo -n "dereverb$n "; done)
+                _scp_list+=$(for n in $(seq $dereverb_ref_num); do echo -n "dereverb$n.scp "; done)
+            fi
+
+            # Copy data dir
+            utils/copy_data_dir.sh --validate_opts --non-print "${data_feats}/org/${dset}" "${data_feats}/${dset}"
+            cp "${data_feats}/org/${dset}/feats_type" "${data_feats}/${dset}/feats_type"
+
+            for utt_extra_file in ${utt_extra_files}; do
+                # The extra files (e.g. utt2category) are optional: copy only what exists
+                if [ -f "${data_feats}/org/${dset}/${utt_extra_file}" ]; then
+                    cp "${data_feats}/org/${dset}/${utt_extra_file}" "${data_feats}/${dset}"
+                fi
+            done
+
+            # Remove short utterances
+            _feats_type="$(<"${data_feats}/${dset}/feats_type")"
+            if [ "${_feats_type}" = raw ]; then
+                # Convert the duration limits from seconds to numbers of samples
+                _fs=$(python3 -c "import humanfriendly as h;print(h.parse_size('${fs}'))")
+                _min_length=$(python3 -c "print(int(${min_wav_duration} * ${_fs}))")
+                _max_length=$(python3 -c "print(int(${max_wav_duration} * ${_fs}))")
+
+                # utt2num_samples is created by format_wav_scp.sh
+                <"${data_feats}/org/${dset}/utt2num_samples" \
+                    awk -v min_length="${_min_length}" -v max_length="${_max_length}" \
+                        '{ if ($2 > min_length && $2 < max_length ) print $0; }' \
+                        >"${data_feats}/${dset}/utt2num_samples"
+                # Keep only the surviving utterances in the mixture and in every reference list
+                for spk in ${_spk_list} "wav"; do
+                    <"${data_feats}/org/${dset}/${spk}.scp" \
+                        utils/filter_scp.pl "${data_feats}/${dset}/utt2num_samples"  \
+                        >"${data_feats}/${dset}/${spk}.scp"
+                done
+            else
+                # Get frame shift in ms from conf/fbank.conf
+                _frame_shift=
+                if [ -f conf/fbank.conf ] && [ "$(<conf/fbank.conf grep -c frame-shift)" -gt 0 ]; then
+                    # Assume using conf/fbank.conf for feature extraction
+                    _frame_shift="$(<conf/fbank.conf grep frame-shift | sed -e 's/[-a-z =]*\([0-9]*\)/\1/g')"
+                fi
+                if [ -z "${_frame_shift}" ]; then
+                    # If not existing, use the default number in Kaldi (=10ms).
+                    # If you are using different number, you have to change the following value manually.
+                    _frame_shift=10
+                fi
+
+                # Convert the duration limits from seconds to numbers of frames
+                _min_length=$(python3 -c "print(int(${min_wav_duration} / ${_frame_shift} * 1000))")
+                _max_length=$(python3 -c "print(int(${max_wav_duration} / ${_frame_shift} * 1000))")
+
+                cp "${data_feats}/org/${dset}/feats_dim" "${data_feats}/${dset}/feats_dim"
+                <"${data_feats}/org/${dset}/feats_shape" awk -F, ' { print $1 } ' \
+                    | awk -v min_length="${_min_length}" -v max_length="${_max_length}" \
+                        '{ if ($2 > min_length && $2 < max_length) print $0; }' \
+                        >"${data_feats}/${dset}/feats_shape"
+                <"${data_feats}/org/${dset}/feats.scp" \
+                    utils/filter_scp.pl "${data_feats}/${dset}/feats_shape"  \
+                    >"${data_feats}/${dset}/feats.scp"
+            fi
+
+            # Remove empty text
+            <"${data_feats}/org/${dset}/text" \
+                awk ' { if( NF != 1 ) print $0; } ' >"${data_feats}/${dset}/text"
+
+            # fix_data_dir.sh leaves only utts which exist in all files
+            utils/fix_data_dir.sh --utt_extra_files "${utt_extra_files}" "${data_feats}/${dset}"
+            for utt_extra_file in ${utt_extra_files}; do
+                # De-duplicate the keys only for the optional files that are present
+                if [ -f "${data_feats}/${dset}/${utt_extra_file}" ]; then
+                    ${python} pyscripts/utils/remove_duplicate_keys.py "${data_feats}/${dset}/${utt_extra_file}" \
+                        > "${data_feats}/${dset}/${utt_extra_file}.tmp"
+                    mv "${data_feats}/${dset}/${utt_extra_file}.tmp" "${data_feats}/${dset}/${utt_extra_file}"
+                fi
+            done
+        done
+
+        # lm_train_text is left unquoted so that it is word-split into separate file arguments
+        # shellcheck disable=SC2002,SC2086
+        cat ${lm_train_text} | awk ' { if( NF != 1 ) print $0; } ' > "${data_feats}/lm_train.txt"
+    fi
+
+    # Stage 5: build the token lists.
+    #   5a: target language (BPE via spm_train, or char/word via tokenize_text)
+    #       plus an optional word list for a word-level LM
+    #   5b: source language, unless source and target share one tokenization
+    if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
+        # Combine source and target texts when using joint tokenization
+        if "${token_joint}"; then
+            log "Merge src and target data if joint BPE"
+
+            # shellcheck disable=SC2086
+            cat $tgt_bpe_train_text > "${data_feats}/${train_set}/text.${src_lang}_${tgt_lang}"
+            # The source text is appended only when one is given
+            if [ -n "${src_bpe_train_text}" ]; then
+                # shellcheck disable=SC2086
+                cat ${src_bpe_train_text} >> "${data_feats}/${train_set}/text.${src_lang}_${tgt_lang}"
+            fi
+            # Set the new text as the target text
+            tgt_bpe_train_text="${data_feats}/${train_set}/text.${src_lang}_${tgt_lang}"
+        fi
+
+        # First generate tgt lang
+        if [ "${tgt_token_type}" = bpe ]; then
+            log "Stage 5a: Generate token_list from ${tgt_bpe_train_text} using BPE for tgt_lang"
+
+            mkdir -p "${tgt_bpedir}"
+            # Drop the utterance ID (first column) before training the BPE model
+            # shellcheck disable=SC2002
+            cat ${tgt_bpe_train_text} | cut -f 2- -d" "  > "${tgt_bpedir}"/train.txt
+
+            if [ -n "${tgt_bpe_nlsyms}" ]; then
+                _opts_spm="--user_defined_symbols=${tgt_bpe_nlsyms}"
+            else
+                _opts_spm=""
+            fi
+
+            spm_train \
+                --input="${tgt_bpedir}"/train.txt \
+                --vocab_size="${tgt_nbpe}" \
+                --model_type="${tgt_bpemode}" \
+                --model_prefix="${tgt_bpeprefix}" \
+                --character_coverage=${tgt_bpe_char_cover} \
+                --input_sentence_size="${tgt_bpe_input_sentence_size}" \
+                ${_opts_spm}
+
+            # Token list layout: blank, oov, BPE pieces, sos/eos
+            {
+            echo "${blank}"
+            echo "${oov}"
+            # Remove <unk>, <s>, </s> from the vocabulary
+            <"${tgt_bpeprefix}".vocab awk '{ if( NR != 1 && NR != 2 && NR != 3 ){ print $1; } }'
+            echo "${sos_eos}"
+            } > "${tgt_token_list}"
+
+        elif [ "${tgt_token_type}" = char ] || [ "${tgt_token_type}" = word ]; then
+            log "Stage 5a: Generate ${tgt_token_type} level token_list from ${tgt_bpe_train_text}  for tgt_lang"
+
+            _opts="--non_linguistic_symbols ${nlsyms_txt}"
+
+            # Drop the utterance ID (first column); token_train.txt holds plain text only
+            # shellcheck disable=SC2002
+            cat ${tgt_bpe_train_text} | cut -f 2- -d" "  > "${data_feats}"/token_train.txt
+
+            # The first symbol in token_list must be "<blank>" and the last must be also sos/eos:
+            # 0 is reserved for CTC-blank for ST and also used as ignore-index in the other task
+            # NOTE: "--field 2-" is not given here. The ID column is already removed above,
+            # so skipping one more column would drop the first word of every sentence.
+            ${python} -m espnet2.bin.tokenize_text  \
+                --token_type "${tgt_token_type}" \
+                --input "${data_feats}/token_train.txt" --output "${tgt_token_list}" ${_opts} \
+                --cleaner "${cleaner}" \
+                --g2p "${g2p}" \
+                --write_vocabulary true \
+                --add_symbol "${blank}:0" \
+                --add_symbol "${oov}:1" \
+                --add_symbol "${sos_eos}:-1"
+
+        else
+            log "Error: not supported --token_type '${tgt_token_type}'"
+            exit 2
+        fi
+
+        # Create word-list for word-LM training
+        if ${use_word_lm} && [ "${tgt_token_type}" != word ]; then
+            log "Generate word level token_list from ${data_feats}/lm_train.txt"
+            # lm_train.txt still carries the utterance ID, hence "--field 2-"
+            ${python} -m espnet2.bin.tokenize_text \
+                --token_type word \
+                --input "${data_feats}/lm_train.txt" --output "${lm_token_list}" \
+                --field 2- \
+                --cleaner "${cleaner}" \
+                --g2p "${g2p}" \
+                --write_vocabulary true \
+                --vocabulary_size "${word_vocab_size}" \
+                --add_symbol "${blank}:0" \
+                --add_symbol "${oov}:1" \
+                --add_symbol "${sos_eos}:-1"
+        fi
+
+        # Then generate src lang
+        if "${token_joint}"; then
+            log "Stage 5b: Skip separate token construction for src_lang when setting token_joint as true"
+        else
+            if [ "${src_token_type}" = bpe ]; then
+                log "Stage 5b: Generate token_list from ${src_bpe_train_text} using BPE for src_lang"
+
+                mkdir -p "${src_bpedir}"
+                # Drop the utterance ID (first column) before training the BPE model
+                # shellcheck disable=SC2002
+                cat ${src_bpe_train_text} | cut -f 2- -d" "  > "${src_bpedir}"/train.txt
+
+                if [ -n "${src_bpe_nlsyms}" ]; then
+                    _opts_spm="--user_defined_symbols=${src_bpe_nlsyms}"
+                else
+                    _opts_spm=""
+                fi
+
+                spm_train \
+                    --input="${src_bpedir}"/train.txt \
+                    --vocab_size="${src_nbpe}" \
+                    --model_type="${src_bpemode}" \
+                    --model_prefix="${src_bpeprefix}" \
+                    --character_coverage=${src_bpe_char_cover} \
+                    --input_sentence_size="${src_bpe_input_sentence_size}" \
+                    ${_opts_spm}
+
+                # Token list layout: blank, oov, BPE pieces, sos/eos
+                {
+                echo "${blank}"
+                echo "${oov}"
+                # Remove <unk>, <s>, </s> from the vocabulary
+                <"${src_bpeprefix}".vocab awk '{ if( NR != 1 && NR != 2 && NR != 3 ){ print $1; } }'
+                echo "${sos_eos}"
+                } > "${src_token_list}"
+
+            elif [ "${src_token_type}" = char ] || [ "${src_token_type}" = word ]; then
+                log "Stage 5b: Generate ${src_token_type} level token_list from ${src_bpe_train_text}  for src_lang"
+
+                _opts="--non_linguistic_symbols ${nlsyms_txt}"
+
+                # Drop the utterance ID (first column); token_train.txt holds plain text only
+                # shellcheck disable=SC2002
+                cat ${src_bpe_train_text} | cut -f 2- -d" "  > "${data_feats}"/token_train.txt
+
+                # The first symbol in token_list must be "<blank>" and the last must be also sos/eos:
+                # 0 is reserved for CTC-blank for ST and also used as ignore-index in the other task
+                # NOTE: "--field 2-" is not given here. The ID column is already removed above,
+                # so skipping one more column would drop the first word of every sentence.
+                ${python} -m espnet2.bin.tokenize_text  \
+                    --token_type "${src_token_type}" \
+                    --input "${data_feats}/token_train.txt" --output "${src_token_list}" ${_opts} \
+                    --cleaner "${cleaner}" \
+                    --g2p "${g2p}" \
+                    --write_vocabulary true \
+                    --add_symbol "${blank}:0" \
+                    --add_symbol "${oov}:1" \
+                    --add_symbol "${sos_eos}:-1"
+
+            else
+                log "Error: not supported --token_type '${src_token_type}'"
+                exit 2
+            fi
+
+
+        fi
+    fi
+
+else
+    log "Skip the stages for data preparation"
+fi
+
+
+# ========================== Data preparation is done here. ==========================
+
+
+if ! "${skip_train}"; then
+    if "${use_lm}"; then
+        # Stage 6: run espnet2.bin.lm_train in collect-stats mode as parallel jobs
+        # over the LM training/dev text, then merge the per-job statistics into
+        # ${lm_stats_dir}.
+        if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
+            log "Stage 6: LM collect stats: train_set=${data_feats}/lm_train.txt, dev_set=${lm_dev_text}"
+
+            _opts=
+            if [ -n "${lm_config}" ]; then
+                # To generate the config file: e.g.
+                #   % python3 -m espnet2.bin.lm_train --print_config --optim adam
+                _opts+="--config ${lm_config} "
+            fi
+
+            # 1. Split the key file
+            _logdir="${lm_stats_dir}/logdir"
+            mkdir -p "${_logdir}"
+            # Get the minimum number among ${nj} and the number lines of input files
+            _nj=$(min "${nj}" "$(<${data_feats}/lm_train.txt wc -l)" "$(<${lm_dev_text} wc -l)")
+
+            # One train.<n>.scp per job
+            key_file="${data_feats}/lm_train.txt"
+            split_scps=""
+            for n in $(seq ${_nj}); do
+                split_scps+=" ${_logdir}/train.${n}.scp"
+            done
+            # shellcheck disable=SC2086
+            utils/split_scp.pl "${key_file}" ${split_scps}
+
+            # One dev.<n>.scp per job
+            key_file="${lm_dev_text}"
+            split_scps=""
+            for n in $(seq ${_nj}); do
+                split_scps+=" ${_logdir}/dev.${n}.scp"
+            done
+            # shellcheck disable=SC2086
+            utils/split_scp.pl "${key_file}" ${split_scps}
+
+            # 2. Generate run.sh
+            log "Generate '${lm_stats_dir}/run.sh'. You can resume the process from stage 6 using this script"
+            mkdir -p "${lm_stats_dir}"; echo "${run_args} --stage 6 \"\$@\"; exit \$?" > "${lm_stats_dir}/run.sh"; chmod +x "${lm_stats_dir}/run.sh"
+
+            # 3. Submit jobs
+            log "LM collect-stats started... log: '${_logdir}/stats.*.log'"
+            # NOTE: --*_shape_file doesn't require length information if --batch_type=unsorted,
+            #       but it's used only for deciding the sample ids.
+            # If the job array fails, the log of the first job is printed before exiting.
+            # shellcheck disable=SC2086
+            ${train_cmd} JOB=1:"${_nj}" "${_logdir}"/stats.JOB.log \
+                ${python} -m espnet2.bin.lm_train \
+                    --collect_stats true \
+                    --use_preprocessor true \
+                    --bpemodel "${tgt_bpemodel}" \
+                    --token_type "${lm_token_type}"\
+                    --token_list "${lm_token_list}" \
+                    --non_linguistic_symbols "${nlsyms_txt}" \
+                    --cleaner "${cleaner}" \
+                    --g2p "${g2p}" \
+                    --train_data_path_and_name_and_type "${data_feats}/lm_train.txt,text,text" \
+                    --valid_data_path_and_name_and_type "${lm_dev_text},text,text" \
+                    --train_shape_file "${_logdir}/train.JOB.scp" \
+                    --valid_shape_file "${_logdir}/dev.JOB.scp" \
+                    --output_dir "${_logdir}/stats.JOB" \
+                    ${_opts} ${lm_args} || { cat "${_logdir}"/stats.1.log; exit 1; }
+
+            # 4. Aggregate shape files
+            _opts=
+            for i in $(seq "${_nj}"); do
+                _opts+="--input_dir ${_logdir}/stats.${i} "
+            done
+            # shellcheck disable=SC2086
+            ${python} -m espnet2.bin.aggregate_stats_dirs ${_opts} --output_dir "${lm_stats_dir}"
+
+            # Append the num-tokens at the last dimensions. This is used for batch-bins count
+            # (N is the number of lines of the LM token list.)
+            <"${lm_stats_dir}/train/text_shape" \
+                awk -v N="$(<${lm_token_list} wc -l)" '{ print $0 "," N }' \
+                >"${lm_stats_dir}/train/text_shape.${lm_token_type}"
+
+            <"${lm_stats_dir}/valid/text_shape" \
+                awk -v N="$(<${lm_token_list} wc -l)" '{ print $0 "," N }' \
+                >"${lm_stats_dir}/valid/text_shape.${lm_token_type}"
+        fi
+
+
+        # Stage 7: train the language model with espnet2.bin.lm_train, started
+        # through espnet2.bin.launch. It uses the shape files written by stage 6.
+        if [ ${stage} -le 7 ] && [ ${stop_stage} -ge 7 ]; then
+            log "Stage 7: LM Training: train_set=${data_feats}/lm_train.txt, dev_set=${lm_dev_text}"
+
+            _opts=
+            if [ -n "${lm_config}" ]; then
+                # To generate the config file: e.g.
+                #   % python3 -m espnet2.bin.lm_train --print_config --optim adam
+                _opts+="--config ${lm_config} "
+            fi
+
+            if [ "${num_splits_lm}" -gt 1 ]; then
+                # If you met a memory error when parsing text files, this option may help you.
+                # The corpus is split into subsets and each subset is used for training one by one in order,
+                # so the memory footprint can be limited to the memory required for each dataset.
+
+                # The ".done" marker makes the split step run only once per split directory
+                _split_dir="${lm_stats_dir}/splits${num_splits_lm}"
+                if [ ! -f "${_split_dir}/.done" ]; then
+                    rm -f "${_split_dir}/.done"
+                    ${python} -m espnet2.bin.split_scps \
+                      --scps "${data_feats}/lm_train.txt" "${lm_stats_dir}/train/text_shape.${lm_token_type}" \
+                      --num_splits "${num_splits_lm}" \
+                      --output_dir "${_split_dir}"
+                    touch "${_split_dir}/.done"
+                else
+                    log "${_split_dir}/.done exists. Spliting is skipped"
+                fi
+
+                # Train from the split copies of the text and of its shape file
+                _opts+="--train_data_path_and_name_and_type ${_split_dir}/lm_train.txt,text,text "
+                _opts+="--train_shape_file ${_split_dir}/text_shape.${lm_token_type} "
+                _opts+="--multiple_iterator true "
+
+            else
+                _opts+="--train_data_path_and_name_and_type ${data_feats}/lm_train.txt,text,text "
+                _opts+="--train_shape_file ${lm_stats_dir}/train/text_shape.${lm_token_type} "
+            fi
+
+            # NOTE(kamo): --fold_length is used only if --batch_type=folded and it's ignored in the other case
+
+            log "Generate '${lm_exp}/run.sh'. You can resume the process from stage 7 using this script"
+            mkdir -p "${lm_exp}"; echo "${run_args} --stage 7 \"\$@\"; exit \$?" > "${lm_exp}/run.sh"; chmod +x "${lm_exp}/run.sh"
+
+            log "LM training started... log: '${lm_exp}/train.log'"
+            # The job name depends on the backend detected in cuda_cmd
+            if echo "${cuda_cmd}" | grep -e queue.pl -e queue-freegpu.pl &> /dev/null; then
+                # SGE can't include "/" in a job name
+                jobname="$(basename ${lm_exp})"
+            else
+                jobname="${lm_exp}/train.log"
+            fi
+
+            # TODO(jiatong): fix bpe
+            # Everything after "--" is the training command that the launcher runs
+            # shellcheck disable=SC2086
+            ${python} -m espnet2.bin.launch \
+                --cmd "${cuda_cmd} --name ${jobname}" \
+                --log "${lm_exp}"/train.log \
+                --ngpu "${ngpu}" \
+                --num_nodes "${num_nodes}" \
+                --init_file_prefix "${lm_exp}"/.dist_init_ \
+                --multiprocessing_distributed true -- \
+                ${python} -m espnet2.bin.lm_train \
+                    --ngpu "${ngpu}" \
+                    --use_preprocessor true \
+                    --bpemodel "${tgt_bpemodel}" \
+                    --token_type "${lm_token_type}"\
+                    --token_list "${lm_token_list}" \
+                    --non_linguistic_symbols "${nlsyms_txt}" \
+                    --cleaner "${cleaner}" \
+                    --g2p "${g2p}" \
+                    --valid_data_path_and_name_and_type "${lm_dev_text},text,text" \
+                    --valid_shape_file "${lm_stats_dir}/valid/text_shape.${lm_token_type}" \
+                    --fold_length "${lm_fold_length}" \
+                    --resume true \
+                    --output_dir "${lm_exp}" \
+                    ${_opts} ${lm_args}
+
+        fi
+
+
+        # Stage 8: compute the perplexity of the trained LM on lm_test_text and
+        # log the value read from ${lm_exp}/perplexity_test/ppl.
+        if [ ${stage} -le 8 ] && [ ${stop_stage} -ge 8 ]; then
+            log "Stage 8: Calc perplexity: ${lm_test_text}"
+            # No extra options are added in this stage; _opts stays empty
+            _opts=
+            # TODO(kamo): Parallelize?
+            log "Perplexity calculation started... log: '${lm_exp}/perplexity_test/lm_calc_perplexity.log'"
+            # shellcheck disable=SC2086
+            ${cuda_cmd} --gpu "${ngpu}" "${lm_exp}"/perplexity_test/lm_calc_perplexity.log \
+                ${python} -m espnet2.bin.lm_calc_perplexity \
+                    --ngpu "${ngpu}" \
+                    --data_path_and_name_and_type "${lm_test_text},text,text" \
+                    --train_config "${lm_exp}"/config.yaml \
+                    --model_file "${lm_exp}/${inference_lm}" \
+                    --output_dir "${lm_exp}/perplexity_test" \
+                    ${_opts}
+            log "PPL: ${lm_test_text}: $(cat ${lm_exp}/perplexity_test/ppl)"
+
+        fi
+
+    else
+        log "Stage 6-8: Skip lm-related stages: use_lm=${use_lm}"
+    fi
+
+
+    # The n-gram experiment directory is created whenever the n-gram LM is
+    # enabled, independent of the selected stage range.
+    if "${use_ngram}"; then
+        mkdir -p "${ngram_exp}"
+    fi
+    # Stage 9: train an n-gram LM of order ${ngram_num} with lmplz and convert
+    # the resulting ARPA file with build_binary.
+    if [ ${stage} -le 9 ] && [ ${stop_stage} -ge 9 ]; then
+        if "${use_ngram}"; then
+            log "Stage 9: Ngram Training: train_set=${data_feats}/lm_train.txt"
+            # Drop only the utterance ID (first column) and keep the whole sentence;
+            # "-f 2" would keep just the first word of each line
+            cut -f 2- -d " " "${data_feats}/lm_train.txt" \
+                | lmplz -S "20%" --discount_fallback -o "${ngram_num}" - >"${ngram_exp}/${ngram_num}gram.arpa"
+            build_binary -s "${ngram_exp}/${ngram_num}gram.arpa" "${ngram_exp}/${ngram_num}gram.bin"
+        else
+            log "Stage 9: Skip ngram stages: use_ngram=${use_ngram}"
+        fi
+    fi
+
+
+    # Stage 10: run espnet2.bin.enh_s2t_train in collect-stats mode as parallel
+    # jobs over the training/validation sets, merge the per-job statistics into
+    # ${enh_st_stats_dir}, and append the vocabulary sizes to the text shape files.
+    if [ ${stage} -le 10 ] && [ ${stop_stage} -ge 10 ]; then
+        _enh_st_train_dir="${data_feats}/${train_set}"
+        _enh_st_valid_dir="${data_feats}/${valid_set}"
+        log "Stage 10: ST collect stats: train_set=${_enh_st_train_dir}, valid_set=${_enh_st_valid_dir}"
+
+        _opts=
+        if [ -n "${enh_st_config}" ]; then
+            # To generate the config file: e.g.
+            #   % python3 -m espnet2.bin.enh_s2t_train --print_config --optim adam
+            _opts+="--config ${enh_st_config} "
+        fi
+
+        # Choose the input list and its loader type from the feats_type file of the training set
+        _feats_type="$(<${_enh_st_train_dir}/feats_type)"
+        if [ "${_feats_type}" = raw ]; then
+            _scp=wav.scp
+            if [[ "${audio_format}" == *ark* ]]; then
+                _type=kaldi_ark
+            else
+                # "sound" supports "wav", "flac", etc.
+                _type=sound
+            fi
+            _opts+="--frontend_conf fs=${fs} "
+        else
+            _scp=feats.scp
+            _type=kaldi_ark
+            _input_size="$(<${_enh_st_train_dir}/feats_dim)"
+            _opts+="--input_size=${_input_size} "
+        fi
+
+        # 1. Split the key file
+        _logdir="${enh_st_stats_dir}/logdir"
+        mkdir -p "${_logdir}"
+
+        # Get the minimum number among ${nj} and the number lines of input files
+        _nj=$(min "${nj}" "$(<${_enh_st_train_dir}/${_scp} wc -l)" "$(<${_enh_st_valid_dir}/${_scp} wc -l)")
+
+        # One train.<n>.scp per job
+        key_file="${_enh_st_train_dir}/${_scp}"
+        split_scps=""
+        for n in $(seq "${_nj}"); do
+            split_scps+=" ${_logdir}/train.${n}.scp"
+        done
+        # shellcheck disable=SC2086
+        utils/split_scp.pl "${key_file}" ${split_scps}
+
+        # One valid.<n>.scp per job
+        key_file="${_enh_st_valid_dir}/${_scp}"
+        split_scps=""
+        for n in $(seq "${_nj}"); do
+            split_scps+=" ${_logdir}/valid.${n}.scp"
+        done
+        # shellcheck disable=SC2086
+        utils/split_scp.pl "${key_file}" ${split_scps}
+
+        # 2. Generate run.sh
+        log "Generate '${enh_st_stats_dir}/run.sh'. You can resume the process from stage 10 using this script"
+        mkdir -p "${enh_st_stats_dir}"; echo "${run_args} --stage 10 \"\$@\"; exit \$?" > "${enh_st_stats_dir}/run.sh"; chmod +x "${enh_st_stats_dir}/run.sh"
+
+        # 3. Submit jobs
+        log "ST collect-stats started... log: '${_logdir}/stats.*.log'"
+
+        # NOTE: --*_shape_file doesn't require length information if --batch_type=unsorted,
+        #       but it's used only for deciding the sample ids.
+
+        # TODO(jiatong): fix different bpe model
+        # "speech" and "speech_ref1" are both read from the same ${_scp} list here.
+        # NOTE(review): presumably sufficient for collecting shapes — confirm.
+        # If the job array fails, the log of the first job is printed before exiting.
+        # shellcheck disable=SC2086
+        ${train_cmd} JOB=1:"${_nj}" "${_logdir}"/stats.JOB.log \
+            ${python} -m espnet2.bin.enh_s2t_train \
+                --collect_stats true \
+                --use_preprocessor true \
+                --bpemodel "${tgt_bpemodel}" \
+                --src_bpemodel "${src_bpemodel}" \
+                --token_type "${tgt_token_type}" \
+                --src_token_type "${src_token_type}" \
+                --token_list "${tgt_token_list}" \
+                --src_token_list "${src_token_list}" \
+                --non_linguistic_symbols "${nlsyms_txt}" \
+                --cleaner "${cleaner}" \
+                --g2p "${g2p}" \
+                --train_data_path_and_name_and_type "${_enh_st_train_dir}/${_scp},speech,${_type}" \
+                --train_data_path_and_name_and_type "${_enh_st_train_dir}/${_scp},speech_ref1,${_type}" \
+                --train_data_path_and_name_and_type "${_enh_st_train_dir}/text.${tgt_case}.${tgt_lang},text,text" \
+                --train_data_path_and_name_and_type "${_enh_st_train_dir}/text.${src_case}.${src_lang},src_text,text" \
+                --valid_data_path_and_name_and_type "${_enh_st_valid_dir}/${_scp},speech,${_type}" \
+                --valid_data_path_and_name_and_type "${_enh_st_valid_dir}/${_scp},speech_ref1,${_type}" \
+                --valid_data_path_and_name_and_type "${_enh_st_valid_dir}/text.${tgt_case}.${tgt_lang},text,text" \
+                --valid_data_path_and_name_and_type "${_enh_st_valid_dir}/text.${src_case}.${src_lang},src_text,text" \
+                --train_shape_file "${_logdir}/train.JOB.scp" \
+                --valid_shape_file "${_logdir}/valid.JOB.scp" \
+                --output_dir "${_logdir}/stats.JOB" \
+                ${_opts} ${enh_st_args} || { cat "${_logdir}"/stats.1.log; exit 1; }
+
+        # 4. Aggregate shape files
+        _opts=
+        for i in $(seq "${_nj}"); do
+            _opts+="--input_dir ${_logdir}/stats.${i} "
+        done
+        # shellcheck disable=SC2086
+        ${python} -m espnet2.bin.aggregate_stats_dirs ${_opts} --output_dir "${enh_st_stats_dir}"
+
+        # Append the num-tokens at the last dimensions. This is used for batch-bins count
+        # (N is the number of lines of the target or source token list, respectively.)
+        <"${enh_st_stats_dir}/train/text_shape" \
+            awk -v N="$(<${tgt_token_list} wc -l)" '{ print $0 "," N }' \
+            >"${enh_st_stats_dir}/train/text_shape.${tgt_token_type}"
+
+        <"${enh_st_stats_dir}/train/src_text_shape" \
+            awk -v N="$(<${src_token_list} wc -l)" '{ print $0 "," N }' \
+            >"${enh_st_stats_dir}/train/src_text_shape.${src_token_type}"
+
+        <"${enh_st_stats_dir}/valid/text_shape" \
+            awk -v N="$(<${tgt_token_list} wc -l)" '{ print $0 "," N }' \
+            >"${enh_st_stats_dir}/valid/text_shape.${tgt_token_type}"
+
+        <"${enh_st_stats_dir}/valid/src_text_shape" \
+            awk -v N="$(<${src_token_list} wc -l)" '{ print $0 "," N }' \
+            >"${enh_st_stats_dir}/valid/src_text_shape.${src_token_type}"
+    fi
+
+
+    if [ ${stage} -le 11 ] && [ ${stop_stage} -ge 11 ]; then
+        _enh_st_train_dir="${data_feats}/${train_set}"
+        _enh_st_valid_dir="${data_feats}/${valid_set}"
+        log "Stage 11: ST Training: train_set=${_enh_st_train_dir}, valid_set=${_enh_st_valid_dir}"
+
+        _opts=
+        if [ -n "${enh_st_config}" ]; then
+            # To generate the config file: e.g.
+            #   % python3 -m espnet2.bin.enh_s2t_train --print_config --optim adam
+            _opts+="--config ${enh_st_config} "
+        fi
+
+        _feats_type="$(<${_enh_st_train_dir}/feats_type)"
+        if [ "${_feats_type}" = raw ]; then
+            _scp=wav.scp
+            # "sound" supports "wav", "flac", etc.
+            if [[ "${audio_format}" == *ark* ]]; then
+                _type=kaldi_ark
+            else
+                _type=sound
+            fi
+            _fold_length="$((enh_st_speech_fold_length * 100))"
+            _opts+="--frontend_conf fs=${fs} "
+        else
+            _scp=feats.scp
+            _type=kaldi_ark
+            _fold_length="${enh_st_speech_fold_length}"
+            _input_size="$(<${_enh_st_train_dir}/feats_dim)"
+            _opts+="--input_size=${_input_size} "
+
+        fi
+        if [ "${feats_normalize}" = global_mvn ]; then
+            # Default normalization is utterance_mvn and changes to global_mvn
+            _opts+="--normalize=global_mvn --normalize_conf stats_file=${enh_st_stats_dir}/train/feats_stats.npz "
+        fi
+
+        if [ "${num_splits_st}" -gt 1 ]; then
+            # If you met a memory error when parsing text files, this option may help you.
+            # The corpus is split into subsets and each subset is used for training one by one in order,
+            # so the memory footprint can be limited to the memory required for each dataset.
+            # spk1.scp is split as well because it is read from ${_split_dir} as speech_ref1 below.
+            _split_dir="${enh_st_stats_dir}/splits${num_splits_st}"
+            if [ ! -f "${_split_dir}/.done" ]; then
+                rm -f "${_split_dir}/.done"
+                ${python} -m espnet2.bin.split_scps \
+                  --scps \
+                      "${_enh_st_train_dir}/${_scp}" \
+                      "${_enh_st_train_dir}/spk1.scp" \
+                      "${_enh_st_train_dir}/text.${tgt_case}.${tgt_lang}" \
+                      "${_enh_st_train_dir}/text.${src_case}.${src_lang}" \
+                      "${enh_st_stats_dir}/train/speech_shape" \
+                      "${enh_st_stats_dir}/train/speech_ref1_shape" \
+                      "${enh_st_stats_dir}/train/text_shape.${tgt_token_type}" \
+                      "${enh_st_stats_dir}/train/src_text_shape.${src_token_type}" \
+                  --num_splits "${num_splits_st}" --output_dir "${_split_dir}"
+                touch "${_split_dir}/.done"
+            else
+                log "${_split_dir}/.done exists. Spliting is skipped"
+            fi
+
+            _opts+="--train_data_path_and_name_and_type ${_split_dir}/${_scp},speech,${_type} "
+            _opts+="--train_data_path_and_name_and_type ${_split_dir}/spk1.scp,speech_ref1,${_type} "
+            _opts+="--train_data_path_and_name_and_type ${_split_dir}/text.${tgt_case}.${tgt_lang},text,text "
+            _opts+="--train_data_path_and_name_and_type ${_split_dir}/text.${src_case}.${src_lang},src_text,text "
+            _opts+="--train_shape_file ${_split_dir}/speech_shape "
+            _opts+="--train_shape_file ${_split_dir}/speech_ref1_shape "
+            _opts+="--train_shape_file ${_split_dir}/text_shape.${tgt_token_type} "
+            _opts+="--train_shape_file ${_split_dir}/src_text_shape.${src_token_type} "
+            _opts+="--multiple_iterator true "
+        else
+            _opts+="--train_data_path_and_name_and_type ${_enh_st_train_dir}/${_scp},speech,${_type} "
+            _opts+="--train_data_path_and_name_and_type ${_enh_st_train_dir}/spk1.scp,speech_ref1,${_type} "
+            _opts+="--train_data_path_and_name_and_type ${_enh_st_train_dir}/text.${tgt_case}.${tgt_lang},text,text "
+            _opts+="--train_data_path_and_name_and_type ${_enh_st_train_dir}/text.${src_case}.${src_lang},src_text,text "
+            _opts+="--train_shape_file ${enh_st_stats_dir}/train/speech_shape "
+            _opts+="--train_shape_file ${enh_st_stats_dir}/train/speech_ref1_shape "
+            _opts+="--train_shape_file ${enh_st_stats_dir}/train/text_shape.${tgt_token_type} "
+            _opts+="--train_shape_file ${enh_st_stats_dir}/train/src_text_shape.${src_token_type} "
+        fi
+
+        log "Generate '${enh_st_exp}/run.sh'. You can resume the process from stage 11 using this script"
+        mkdir -p "${enh_st_exp}"; echo "${run_args} --stage 11 \"\$@\"; exit \$?" > "${enh_st_exp}/run.sh"; chmod +x "${enh_st_exp}/run.sh"
+
+        # NOTE(kamo): --fold_length is used only if --batch_type=folded and it's ignored in the other case
+        log "ST training started... log: '${enh_st_exp}/train.log'"
+        if echo "${cuda_cmd}" | grep -e queue.pl -e queue-freegpu.pl &> /dev/null; then
+            # SGE can't include "/" in a job name
+            jobname="$(basename ${enh_st_exp})"
+        else
+            jobname="${enh_st_exp}/train.log"
+        fi
+
+        # TODO(jiatong): fix bpe
+        # shellcheck disable=SC2086
+        ${python} -m espnet2.bin.launch \
+            --cmd "${cuda_cmd} --name ${jobname}" \
+            --log "${enh_st_exp}"/train.log \
+            --ngpu "${ngpu}" \
+            --num_nodes "${num_nodes}" \
+            --init_file_prefix "${enh_st_exp}"/.dist_init_ \
+            --multiprocessing_distributed true -- \
+            ${python} -m espnet2.bin.enh_s2t_train \
+                --use_preprocessor true \
+                --bpemodel "${tgt_bpemodel}" \
+                --token_type "${tgt_token_type}" \
+                --token_list "${tgt_token_list}" \
+                --src_bpemodel "${src_bpemodel}" \
+                --src_token_type "${src_token_type}" \
+                --src_token_list "${src_token_list}" \
+                --non_linguistic_symbols "${nlsyms_txt}" \
+                --cleaner "${cleaner}" \
+                --g2p "${g2p}" \
+                --valid_data_path_and_name_and_type "${_enh_st_valid_dir}/${_scp},speech,${_type}" \
+                --valid_data_path_and_name_and_type "${_enh_st_valid_dir}/spk1.scp,speech_ref1,${_type}" \
+                --valid_data_path_and_name_and_type "${_enh_st_valid_dir}/text.${tgt_case}.${tgt_lang},text,text" \
+                --valid_data_path_and_name_and_type "${_enh_st_valid_dir}/text.${src_case}.${src_lang},src_text,text" \
+                --valid_shape_file "${enh_st_stats_dir}/valid/speech_shape" \
+                --valid_shape_file "${enh_st_stats_dir}/valid/speech_ref1_shape" \
+                --valid_shape_file "${enh_st_stats_dir}/valid/text_shape.${tgt_token_type}" \
+                --valid_shape_file "${enh_st_stats_dir}/valid/src_text_shape.${src_token_type}" \
+                --resume true \
+                --init_param ${pretrained_model} \
+                --ignore_init_mismatch ${ignore_init_mismatch} \
+                --fold_length "${_fold_length}" \
+                --fold_length "${_fold_length}" \
+                --fold_length "${enh_st_text_fold_length}" \
+                --fold_length "${enh_st_text_fold_length}" \
+                --output_dir "${enh_st_exp}" \
+                ${_opts} ${enh_st_args}
+
+    fi
+else
+    log "Skip the training stages"
+fi
+
+
+if [ -n "${download_model}" ]; then
+    log "Use ${download_model} for decoding and evaluation"
+    enh_st_exp="${expdir}/${download_model}"
+    mkdir -p "${enh_st_exp}"
+
+    # If the model already exists, you can skip downloading
+    espnet_model_zoo_download --unpack true "${download_model}" > "${enh_st_exp}/config.txt"
+
+    # Get the path of each file
+    _enh_st_model_file=$(<"${enh_st_exp}/config.txt" sed -e "s/.*'enh_s2t_model_file': '\([^']*\)'.*$/\1/")
+    _enh_st_train_config=$(<"${enh_st_exp}/config.txt" sed -e "s/.*'enh_s2t_train_config': '\([^']*\)'.*$/\1/")
+
+    # Create symbolic links
+    ln -sf "${_enh_st_model_file}" "${enh_st_exp}"
+    ln -sf "${_enh_st_train_config}" "${enh_st_exp}"
+    inference_enh_st_model=$(basename "${_enh_st_model_file}")
+
+    if [ "$(<${enh_st_exp}/config.txt grep -c lm_file)" -gt 0 ]; then
+        _lm_file=$(<"${enh_st_exp}/config.txt" sed -e "s/.*'lm_file': '\([^']*\)'.*$/\1/")
+        _lm_train_config=$(<"${enh_st_exp}/config.txt" sed -e "s/.*'lm_train_config': '\([^']*\)'.*$/\1/")
+
+        lm_exp="${expdir}/${download_model}/lm"
+        mkdir -p "${lm_exp}"
+
+        ln -sf "${_lm_file}" "${lm_exp}"
+        ln -sf "${_lm_train_config}" "${lm_exp}"
+        inference_lm=$(basename "${_lm_file}")
+    fi
+
+fi
+
+
+if ! "${skip_eval}"; then
+    if [ ${stage} -le 12 ] && [ ${stop_stage} -ge 12 ]; then
+        log "Stage 12: Decoding: training_dir=${enh_st_exp}"
+
+        if ${gpu_inference}; then
+            _cmd="${cuda_cmd}"
+            _ngpu=1
+        else
+            _cmd="${decode_cmd}"
+            _ngpu=0
+        fi
+
+        _opts=
+        if [ -n "${inference_config}" ]; then
+            _opts+="--config ${inference_config} "
+        fi
+        if "${use_lm}"; then
+            if "${use_word_lm}"; then
+                _opts+="--word_lm_train_config ${lm_exp}/config.yaml "
+                _opts+="--word_lm_file ${lm_exp}/${inference_lm} "
+            else
+                _opts+="--lm_train_config ${lm_exp}/config.yaml "
+                _opts+="--lm_file ${lm_exp}/${inference_lm} "
+            fi
+        fi
+        if "${use_ngram}"; then
+             _opts+="--ngram_file ${ngram_exp}/${inference_ngram}"
+        fi
+
+        # 2. Generate run.sh
+        log "Generate '${enh_st_exp}/${inference_tag}/run.sh'. You can resume the process from stage 12 using this script"
+        mkdir -p "${enh_st_exp}/${inference_tag}"; echo "${run_args} --stage 12 \"\$@\"; exit \$?" > "${enh_st_exp}/${inference_tag}/run.sh"; chmod +x "${enh_st_exp}/${inference_tag}/run.sh"
+
+        for dset in ${test_sets}; do
+            _data="${data_feats}/${dset}"
+            _dir="${enh_st_exp}/${inference_tag}/${dset}"
+            _logdir="${_dir}/logdir"
+            mkdir -p "${_logdir}"
+
+            _feats_type="$(<${_data}/feats_type)"
+            if [ "${_feats_type}" = raw ]; then
+                _scp=wav.scp
+                if [[ "${audio_format}" == *ark* ]]; then
+                    _type=kaldi_ark
+                else
+                    _type=sound
+                fi
+            else
+                _scp=feats.scp
+                _type=kaldi_ark
+            fi
+
+            # 1. Split the key file
+            key_file=${_data}/${_scp}
+            split_scps=""
+            _nj=$(min "${inference_nj}" "$(<${key_file} wc -l)")
+            st_inference_tool="espnet2.bin.st_inference"
+
+            for n in $(seq "${_nj}"); do
+                split_scps+=" ${_logdir}/keys.${n}.scp"
+            done
+            # shellcheck disable=SC2086
+            utils/split_scp.pl "${key_file}" ${split_scps}
+
+            # 2. Submit decoding jobs
+            log "Decoding started... log: '${_logdir}/st_inference.*.log'"
+            # shellcheck disable=SC2086
+            ${_cmd} --gpu "${_ngpu}" JOB=1:"${_nj}" "${_logdir}"/st_inference.JOB.log \
+                ${python} -m ${st_inference_tool} \
+                    --batch_size ${batch_size} \
+                    --ngpu "${_ngpu}" \
+                    --data_path_and_name_and_type "${_data}/${_scp},speech,${_type}" \
+                    --key_file "${_logdir}"/keys.JOB.scp \
+                    --st_train_config "${enh_st_exp}"/config.yaml \
+                    --st_model_file "${enh_st_exp}"/"${inference_enh_st_model}" \
+                    --output_dir "${_logdir}"/output.JOB \
+                    ${_opts} ${inference_args}
+
+            # 3. Concatenate the output files from each job
+            for f in token token_int score text; do
+                for i in $(seq "${_nj}"); do
+                    cat "${_logdir}/output.${i}/1best_recog/${f}"
+                done | LC_ALL=C sort -k1 >"${_dir}/${f}"
+            done
+
+        done
+    fi
+    if [ ${stage} -le 13 ] && [ ${stop_stage} -ge 13 ]; then
+        log "Stage 13: Enhance Speech: training_dir=${enh_st_exp}"
+
+        if ${gpu_inference}; then
+            _cmd="${cuda_cmd}"
+            _ngpu=1
+        else
+            _cmd="${decode_cmd}"
+            _ngpu=0
+        fi
+
+        _opts=
+
+        # 2. Generate run.sh
+        log "Generate '${enh_st_exp}/${inference_tag}/run.sh'. You can resume the process from stage 13 using this script"
+        mkdir -p "${enh_st_exp}/${inference_tag}"; echo "${run_args} --stage 13 \"\$@\"; exit \$?" > "${enh_st_exp}/${inference_tag}/run.sh"; chmod +x "${enh_st_exp}/${inference_tag}/run.sh"
+
+        for dset in ${test_sets}; do
+            _data="${data_feats}/${dset}"
+            _dir="${enh_st_exp}/${inference_tag}/${dset}"
+            _logdir="${_dir}/logdir"
+            mkdir -p "${_logdir}"
+
+            _scp=wav.scp
+            if [[ "${audio_format}" == *ark* ]]; then
+                _type=kaldi_ark
+            else
+                _type=sound
+            fi
+
+            # 1. Split the key file
+            key_file=${_data}/${_scp}
+            split_scps=""
+            _nj=$(min "${inference_nj}" "$(<${key_file} wc -l)")
+            for n in $(seq "${_nj}"); do
+                split_scps+=" ${_logdir}/keys.${n}.scp"
+            done
+            # shellcheck disable=SC2086
+            utils/split_scp.pl "${key_file}" ${split_scps}
+
+            # 2. Submit inference jobs
+            log "Enhancement started... log: '${_logdir}/enh_inference.*.log'"
+            # shellcheck disable=SC2086
+            ${_cmd} --gpu "${_ngpu}" JOB=1:"${_nj}" "${_logdir}"/enh_inference.JOB.log \
+                ${python} -m espnet2.bin.enh_inference \
+                    --enh_s2t_task true \
+                    --ngpu "${_ngpu}" \
+                    --fs "${fs}" \
+                    --data_path_and_name_and_type "${_data}/${_scp},speech_mix,${_type}" \
+                    --key_file "${_logdir}"/keys.JOB.scp \
+                    --train_config "${enh_st_exp}"/config.yaml \
+                    --model_file "${enh_st_exp}"/"${inference_enh_st_model}" \
+                    --output_dir "${_logdir}"/output.JOB \
+                    ${_opts} ${enh_inference_args}
+
+            # 3. Concatenate the output files from each job
+            _spk_list=" "
+            for i in $(seq ${spk_num}); do
+                _spk_list+="spk${i} "
+            done
+
+            for spk in ${_spk_list}; do
+                for i in $(seq "${_nj}"); do
+                    cat "${_logdir}/output.${i}/${spk}.scp"
+                done | LC_ALL=C sort -k1 > "${_dir}/${spk}.scp"
+            done
+        done
+    fi
+
+
+    if [ ${stage} -le 14 ] && [ ${stop_stage} -ge 14 ]; then
+        log "Stage 14: Scoring Translation"
+
+        for dset in ${test_sets}; do
+            _data="${data_feats}/${dset}"
+            # Hypotheses are written under enh_st_exp by the decoding stage (Stage 12), so score from there.
+            _dir="${enh_st_exp}/${inference_tag}/${dset}"
+            # TODO(jiatong): add asr scoring and inference
+
+            _scoredir="${_dir}/score_bleu"
+            mkdir -p "${_scoredir}"
+
+            paste \
+                <(<"${_data}/text.${tgt_case}.${tgt_lang}" \
+                    ${python} -m espnet2.bin.tokenize_text  \
+                        -f 2- --input - --output - \
+                        --token_type word \
+                        --non_linguistic_symbols "${nlsyms_txt}" \
+                        --remove_non_linguistic_symbols true \
+                        --cleaner "${cleaner}" \
+                        ) \
+                <(<"${_data}/utt2spk" awk '{ print "(" $2 "-" $1 ")" }') \
+                    >"${_scoredir}/ref.trn.org"
+
+            # NOTE(kamo): Don't use cleaner for hyp
+            paste \
+                <(<"${_dir}/text"  \
+                        ${python} -m espnet2.bin.tokenize_text  \
+                            -f 2- --input - --output - \
+                            --token_type word \
+                            --non_linguistic_symbols "${nlsyms_txt}" \
+                            --remove_non_linguistic_symbols true \
+                            ) \
+                <(<"${_data}/utt2spk" awk '{ print "(" $2 "-" $1 ")" }') \
+                    >"${_scoredir}/hyp.trn.org"
+
+            # remove utterance id
+            perl -pe 's/\([^\)]+\)//g;' "${_scoredir}/ref.trn.org" > "${_scoredir}/ref.trn"
+            perl -pe 's/\([^\)]+\)//g;' "${_scoredir}/hyp.trn.org" > "${_scoredir}/hyp.trn"
+
+            # detokenizer
+            detokenizer.perl -l ${tgt_lang} -q < "${_scoredir}/ref.trn" > "${_scoredir}/ref.trn.detok"
+            detokenizer.perl -l ${tgt_lang} -q < "${_scoredir}/hyp.trn" > "${_scoredir}/hyp.trn.detok"
+
+            if [ ${tgt_case} = "tc" ]; then
+                echo "Case sensitive BLEU result (single-reference)" >> ${_scoredir}/result.tc.txt
+                sacrebleu "${_scoredir}/ref.trn.detok" \
+                          -i "${_scoredir}/hyp.trn.detok" \
+                          -m bleu chrf ter \
+                          >> ${_scoredir}/result.tc.txt
+
+                log "Write a case-sensitive BLEU (single-reference) result in ${_scoredir}/result.tc.txt"
+            fi
+
+            # detokenize & remove punctuation except apostrophe
+            remove_punctuation.pl < "${_scoredir}/ref.trn.detok" > "${_scoredir}/ref.trn.detok.lc.rm"
+            remove_punctuation.pl < "${_scoredir}/hyp.trn.detok" > "${_scoredir}/hyp.trn.detok.lc.rm"
+            echo "Case insensitive BLEU result (single-reference)" >> ${_scoredir}/result.lc.txt
+            sacrebleu -lc "${_scoredir}/ref.trn.detok.lc.rm" \
+                      -i "${_scoredir}/hyp.trn.detok.lc.rm" \
+                      -m bleu chrf ter \
+                      >> ${_scoredir}/result.lc.txt
+            log "Write a case-insensitve BLEU (single-reference) result in ${_scoredir}/result.lc.txt"
+
+            # process multi-references cases
+            multi_references=$(ls "${_data}/text.${tgt_case}.${tgt_lang}".* || echo "")
+            if [ "${multi_references}" != "" ]; then
+                case_sensitive_refs=""
+                case_insensitive_refs=""
+                for multi_reference in ${multi_references}; do
+                    ref_idx="${multi_reference##*.}"
+                    paste \
+                        <(<${multi_reference} \
+                            ${python} -m espnet2.bin.tokenize_text  \
+                                -f 2- --input - --output - \
+                                --token_type word \
+                                --non_linguistic_symbols "${nlsyms_txt}" \
+                                --remove_non_linguistic_symbols true \
+                                --cleaner "${cleaner}" \
+                                ) \
+                        <(<"${_data}/utt2spk" awk '{ print "(" $2 "-" $1 ")" }') \
+                            >"${_scoredir}/ref.trn.org.${ref_idx}"
+
+                    perl -pe 's/\([^\)]+\)//g;' "${_scoredir}/ref.trn.org.${ref_idx}" > "${_scoredir}/ref.trn.${ref_idx}"
+                    detokenizer.perl -l ${tgt_lang} -q < "${_scoredir}/ref.trn.${ref_idx}" > "${_scoredir}/ref.trn.detok.${ref_idx}"
+                    remove_punctuation.pl < "${_scoredir}/ref.trn.detok.${ref_idx}" > "${_scoredir}/ref.trn.detok.lc.rm.${ref_idx}"
+                    case_sensitive_refs="${case_sensitive_refs} ${_scoredir}/ref.trn.detok.${ref_idx}"
+                    case_insensitive_refs="${case_insensitive_refs} ${_scoredir}/ref.trn.detok.lc.rm.${ref_idx}"
+                done
+
+                if [ "${tgt_case}" = "tc" ]; then
+                    # Case-sensitive refs must be scored against hyp.trn.detok (not the lc.rm variant)
+                    echo "Case sensitive BLEU result (multi-references)" >> "${_scoredir}/result.tc.txt"
+                    sacrebleu ${case_sensitive_refs} \
+                        -i "${_scoredir}/hyp.trn.detok" -m bleu chrf ter >> "${_scoredir}/result.tc.txt"
+                    log "Write a case-sensitve BLEU (multi-reference) result in ${_scoredir}/result.tc.txt"
+                fi
+
+                echo "Case insensitive BLEU result (multi-references)" >> ${_scoredir}/result.lc.txt
+                sacrebleu -lc ${case_insensitive_refs} \
+                    -i ${_scoredir}/hyp.trn.detok.lc.rm -m bleu chrf ter \
+                    >> ${_scoredir}/result.lc.txt
+                log "Write a case-insensitve BLEU (multi-reference) result in ${_scoredir}/result.lc.txt"
+            fi
+        done
+
+        # Show results in Markdown syntax
+        scripts/utils/show_translation_result.sh --case $tgt_case "${enh_st_exp}" > "${enh_st_exp}"/RESULTS.md
+        cat "${enh_st_exp}"/RESULTS.md
+
+    fi
+
+    if [ ${stage} -le 15 ] && [ ${stop_stage} -ge 15 ]; then
+        log "Stage 15: Scoring Enhancement"
+        _cmd=${decode_cmd}
+
+        # score_obs=true: Scoring for observation signal
+        # score_obs=false: Scoring for enhanced signal
+        # for score_obs in true false; do
+        for score_obs in true false; do
+            # Perform only at the first time for observation: skip when its RESULTS_enh.md already exists
+            if "${score_obs}" && [ -e "${data_feats}/RESULTS_enh.md" ]; then
+                log "${data_feats}/RESULTS_enh.md already exists. The scoring for observation will be skipped"
+                continue
+            fi
+
+            for dset in ${test_sets}; do
+                _data="${data_feats}/${dset}"
+                if "${score_obs}"; then
+                    _dir="${data_feats}/${dset}/scoring_enh"
+                else
+                    _dir="${enh_st_exp}/${inference_tag}/${dset}/scoring_enh"
+                fi
+
+                _logdir="${_dir}/logdir"
+                mkdir -p "${_logdir}"
+
+                # 1. Split the key file
+                key_file=${_data}/wav.scp
+                split_scps=""
+                _nj=$(min "${inference_nj}" "$(<${key_file} wc -l)")
+                for n in $(seq "${_nj}"); do
+                    split_scps+=" ${_logdir}/keys.${n}.scp"
+                done
+                # shellcheck disable=SC2086
+                utils/split_scp.pl "${key_file}" ${split_scps}
+
+                _ref_scp=
+                for spk in $(seq "${spk_num}"); do
+                    _ref_scp+="--ref_scp ${_data}/spk${spk}.scp "
+                done
+                _inf_scp=
+                for spk in $(seq "${spk_num}"); do
+                    if "${score_obs}"; then
+                        # To compute the score of observation, input original wav.scp
+                        _inf_scp+="--inf_scp ${data_feats}/${dset}/wav.scp "
+                    else
+                        _inf_scp+="--inf_scp ${enh_st_exp}/${inference_tag}/${dset}/spk${spk}.scp "
+                    fi
+                done
+
+                # 2. Submit scoring jobs
+                log "Scoring started... log: '${_logdir}/enh_scoring.*.log'"
+                # shellcheck disable=SC2086
+                ${_cmd} JOB=1:"${_nj}" "${_logdir}"/enh_scoring.JOB.log \
+                    ${python} -m espnet2.bin.enh_scoring \
+                        --key_file "${_logdir}"/keys.JOB.scp \
+                        --output_dir "${_logdir}"/output.JOB \
+                        ${_ref_scp} \
+                        ${_inf_scp} \
+                        --ref_channel ${ref_channel}
+
+                for spk in $(seq "${spk_num}"); do
+                    for protocol in ${scoring_protocol} wav; do
+                        for i in $(seq "${_nj}"); do
+                            cat "${_logdir}/output.${i}/${protocol}_spk${spk}"
+                        done | LC_ALL=C sort -k1 > "${_dir}/${protocol}_spk${spk}"
+                    done
+                done
+
+
+                for protocol in ${scoring_protocol}; do
+                    # shellcheck disable=SC2046
+                    paste $(for j in $(seq ${spk_num}); do echo "${_dir}"/"${protocol}"_spk"${j}" ; done)  |
+                    awk 'BEGIN{sum=0}
+                        {n=0;score=0;for (i=2; i<=NF; i+=2){n+=1;score+=$i}; sum+=score/n}
+                        END{printf ("%.2f\n",sum/NR)}' > "${_dir}/result_${protocol,,}.txt"
+                done
+            done
+
+            ./scripts/utils/show_enh_score.sh "${_dir}/../.." > "${_dir}/../../RESULTS_enh.md"
+        done
+        # Report the paths that the show_enh_score.sh output is redirected to in the loop above.
+        log "Evaluation result for observation: ${data_feats}/RESULTS_enh.md"
+        log "Evaluation result for enhancement: ${enh_st_exp}/${inference_tag}/RESULTS_enh.md"
+    fi
+else
+    log "Skip the evaluation stages"
+fi
+
+
+packed_model="${enh_st_exp}/${enh_st_exp##*/}_${inference_enh_st_model%.*}.zip"
+if ! "${skip_upload_hf}"; then
+    if [ ${stage} -le 16 ] && [ ${stop_stage} -ge 16 ]; then
+        log "Stage 16: Pack model: ${packed_model}"
+
+        _opts=
+        if "${use_lm}"; then
+            _opts+="--lm_train_config ${lm_exp}/config.yaml "
+            _opts+="--lm_file ${lm_exp}/${inference_lm} "
+            _opts+="--option ${lm_exp}/perplexity_test/ppl "
+            _opts+="--option ${lm_exp}/images "
+        fi
+        if [ "${feats_normalize}" = global_mvn ]; then
+            _opts+="--option ${enh_st_stats_dir}/train/feats_stats.npz "
+        fi
+        # Pack each BPE model only when its own side (target / source) uses BPE;
+        # "test || cmd" runs cmd only when the test fails and always leaves a zero status.
+        [ "${tgt_token_type}" != bpe ] || _opts+="--option ${tgt_bpemodel} "
+        [ "${src_token_type}" != bpe ] || _opts+="--option ${src_bpemodel} "
+        if [ "${nlsyms_txt}" != none ]; then
+            _opts+="--option ${nlsyms_txt} "
+        fi
+        # shellcheck disable=SC2086
+        ${python} -m espnet2.bin.pack enh_s2t \
+            --enh_s2t_train_config "${enh_st_exp}"/config.yaml \
+            --enh_s2t_model_file "${enh_st_exp}"/"${inference_enh_st_model}" \
+            ${_opts} \
+            --option "${enh_st_exp}"/RESULTS.md \
+            --option "${enh_st_exp}"/RESULTS_enh.md \
+            --option "${enh_st_exp}"/images \
+            --outpath "${packed_model}"
+    fi
+
+
+    if [ ${stage} -le 17 ] && [ ${stop_stage} -ge 17 ]; then
+        [ -z "${hf_repo}" ] && \
+            log "ERROR: You need to setup the variable hf_repo with the name of the repository located at HuggingFace" && \
+            exit 1
+        log "Stage 17: Upload model to HuggingFace: ${hf_repo}"
+
+        gitlfs=$(git lfs --version 2> /dev/null || true)
+        [ -z "${gitlfs}" ] && \
+            log "ERROR: You need to install git-lfs first" && \
+            exit 1
+        # Local clone of the HF repository; "/" in the repository name is replaced by "_"
+        dir_repo=${expdir}/hf_${hf_repo//"/"/"_"}
+        [ ! -d "${dir_repo}" ] && git clone "https://huggingface.co/${hf_repo}" "${dir_repo}"
+
+        if command -v git &> /dev/null; then
+            _creator_name="$(git config user.name)"
+            _checkout="git checkout $(git show -s --format=%H)"
+        else
+            _creator_name="$(whoami)"
+            _checkout=""
+        fi
+        # /some/where/espnet/egs2/foo/asr1/ -> foo (name of the parent directory)
+        _task="$(pwd | rev | cut -d/ -f2 | rev)"
+        # Strip any "/suffix"; a no-op when _task is a single path component
+        _corpus="${_task%/*}"
+        _model_name="${_creator_name}/${_corpus}_$(basename "${packed_model}" .zip)"
+
+        # copy files in ${dir_repo}
+        unzip -o "${packed_model}" -d "${dir_repo}"
+        # Generate description file
+        # shellcheck disable=SC2034
+        hf_task=speech-enhancement-translation
+        # shellcheck disable=SC2034
+        espnet_task=EnhS2T
+        # shellcheck disable=SC2034
+        task_exp=${enh_st_exp}
+        eval "echo \"$(cat scripts/utils/TEMPLATE_HF_Readme.md)\"" > "${dir_repo}"/README.md
+
+        this_folder=${PWD}
+        cd "${dir_repo}"
+        if [ -n "$(git status --porcelain)" ]; then
+            git add .
+            git commit -m "Update model"
+        fi
+        git push
+        cd "${this_folder}"
+    fi
+else
+    log "Skip the uploading stages"
+fi
+
+
+log "Successfully finished. [elapsed=${SECONDS}s]"
diff --git a/egs2/TEMPLATE/enh_st1/local/path.sh b/egs2/TEMPLATE/enh_st1/local/path.sh
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/egs2/TEMPLATE/enh_st1/path.sh b/egs2/TEMPLATE/enh_st1/path.sh
new file mode 100755
index 00000000000..d2b90a67653
--- /dev/null
+++ b/egs2/TEMPLATE/enh_st1/path.sh
@@ -0,0 +1,22 @@
+MAIN_ROOT=$PWD/../../..
+KALDI_ROOT=$MAIN_ROOT/tools/kaldi
+# Expansions below are quoted so that a checkout path containing whitespace is not word-split.
+export PATH="$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PATH"
+[ ! -f "$KALDI_ROOT/tools/config/common_path.sh" ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1
+. "$KALDI_ROOT/tools/config/common_path.sh"
+export LC_ALL=C
+
+. "${MAIN_ROOT}"/tools/activate_python.sh && . "${MAIN_ROOT}"/tools/extra_path.sh
+export PATH="$MAIN_ROOT/utils:$MAIN_ROOT/espnet/bin:$PATH"
+
+export OMP_NUM_THREADS=1
+
+# NOTE(kan-bayashi): Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
+export PYTHONIOENCODING=UTF-8
+
+# You need to change or unset NCCL_SOCKET_IFNAME according to your network environment
+# https://docs.nvidia.com/deeplearning/sdk/nccl-developer-guide/docs/env.html#nccl-socket-ifname
+export NCCL_SOCKET_IFNAME="^lo,docker,virbr,vmnet,vboxnet"
+
+# NOTE(kamo): Source at the last to overwrite the setting
+. local/path.sh
diff --git a/egs2/TEMPLATE/enh_st1/pyscripts b/egs2/TEMPLATE/enh_st1/pyscripts
new file mode 120000
index 00000000000..90e7cf60b04
--- /dev/null
+++ b/egs2/TEMPLATE/enh_st1/pyscripts
@@ -0,0 +1 @@
+../asr1/pyscripts
\ No newline at end of file
diff --git a/egs2/TEMPLATE/enh_st1/scripts b/egs2/TEMPLATE/enh_st1/scripts
new file mode 120000
index 00000000000..b2cb12d9a6c
--- /dev/null
+++ b/egs2/TEMPLATE/enh_st1/scripts
@@ -0,0 +1 @@
+../enh_asr1/scripts
\ No newline at end of file
diff --git a/egs2/TEMPLATE/enh_st1/setup.sh b/egs2/TEMPLATE/enh_st1/setup.sh
new file mode 100755
index 00000000000..b69c326c340
--- /dev/null
+++ b/egs2/TEMPLATE/enh_st1/setup.sh
@@ -0,0 +1,58 @@
+#!/usr/bin/env bash
+# Set bash to strict mode, it will exit on :
+# -e 'error', -u 'undefined variable', -o pipefail 'error in pipeline'
+set -e
+set -u
+set -o pipefail
+
+log() {
+    local fname=${BASH_SOURCE[1]##*/}
+    echo -e "$(date '+%Y-%m-%dT%H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
+}
+help_message=$(cat << EOF
+Usage: $0 <target-dir>
+EOF
+)
+
+
+if [ $# -ne 1 ]; then
+    log "${help_message}"
+    log "Error: 1 positional argument is required."
+    exit 2
+fi
+
+
+dir=$1
+mkdir -p "${dir}"
+
+if [ ! -d "${dir}"/../../TEMPLATE ]; then
+    log "Error: ${dir}/../../TEMPLATE should exist. You may specify wrong directory."
+    exit 1
+fi
+
+targets=""
+
+# Copy from TEMPLATE (independent copies); log the created path, not the source
+for f in cmd.sh conf local; do
+    target="${dir}"/../../TEMPLATE/enh_st1/"${f}"
+    cp -r "${target}" "${dir}"
+    targets+="${dir}/${f} "
+done
+
+
+# Symlinks to TEMPLATE (each link is created as ${dir}/${f})
+for f in st.sh path.sh db.sh scripts pyscripts; do
+    target=../../TEMPLATE/enh_st1/"${f}"
+    ln -sf "${target}" "${dir}"
+    targets+="${dir}/${f} "
+done
+
+
+# Symlinks to Kaldi (each link is created as ${dir}/${f})
+for f in steps utils; do
+    target=../../../tools/kaldi/egs/wsj/s5/"${f}"
+    ln -sf "${target}" "${dir}"
+    targets+="${dir}/${f} "
+done
+
+log "Created: ${targets}"
diff --git a/egs2/TEMPLATE/enh_st1/steps b/egs2/TEMPLATE/enh_st1/steps
new file mode 120000
index 00000000000..91f2d234e20
--- /dev/null
+++ b/egs2/TEMPLATE/enh_st1/steps
@@ -0,0 +1 @@
+../../../tools/kaldi/egs/wsj/s5/steps
\ No newline at end of file
diff --git a/egs2/TEMPLATE/enh_st1/utils b/egs2/TEMPLATE/enh_st1/utils
new file mode 120000
index 00000000000..6d93948f170
--- /dev/null
+++ b/egs2/TEMPLATE/enh_st1/utils
@@ -0,0 +1 @@
+../../../tools/kaldi/egs/wsj/s5/utils/
\ No newline at end of file
diff --git a/egs2/TEMPLATE/st1/st.sh b/egs2/TEMPLATE/st1/st.sh
index ac4e9dda9ae..895667e1525 100755
--- a/egs2/TEMPLATE/st1/st.sh
+++ b/egs2/TEMPLATE/st1/st.sh
@@ -618,7 +618,7 @@ if ! "${skip_data_prep}"; then
 
         elif  [ "${feats_type}" = extracted ]; then
             log "Stage 3: ${feats_type} extract: data/ -> ${data_feats}"
-            # Assumming you don't have wav.scp, but feats.scp is created by local/data.sh instead.
+            # Assuming you don't have wav.scp, but feats.scp is created by local/data.sh instead.
 
             for dset in "${train_set}" "${valid_set}" ${test_sets}; do
                 if [ "${dset}" = "${train_set}" ] || [ "${dset}" = "${valid_set}" ]; then
diff --git a/espnet2/asr/espnet_model.py b/espnet2/asr/espnet_model.py
index 570912e02c2..08c10182a83 100644
--- a/espnet2/asr/espnet_model.py
+++ b/espnet2/asr/espnet_model.py
@@ -161,6 +161,7 @@ def forward(
         speech_lengths: torch.Tensor,
         text: torch.Tensor,
         text_lengths: torch.Tensor,
+        **kwargs,
     ) -> Tuple[torch.Tensor, Dict[str, torch.Tensor], torch.Tensor]:
         """Frontend + Encoder + Decoder + Calc loss
 
@@ -169,6 +170,7 @@ def forward(
             speech_lengths: (Batch, )
             text: (Batch, Length)
             text_lengths: (Batch,)
+            kwargs: "utt_id" is among the input.
         """
         assert text_lengths.dim() == 1, text_lengths.shape
         # Check that batch_size is unified
@@ -287,6 +289,7 @@ def collect_feats(
         speech_lengths: torch.Tensor,
         text: torch.Tensor,
         text_lengths: torch.Tensor,
+        **kwargs,
     ) -> Dict[str, torch.Tensor]:
         if self.extract_feats_in_collect_stats:
             feats, feats_lengths = self._extract_feats(speech, speech_lengths)
diff --git a/espnet2/asr/frontend/s3prl.py b/espnet2/asr/frontend/s3prl.py
index 4fe53970380..6a497e0fab7 100644
--- a/espnet2/asr/frontend/s3prl.py
+++ b/espnet2/asr/frontend/s3prl.py
@@ -86,10 +86,10 @@ def _get_upstream(self, frontend_conf):
 
         from s3prl.upstream.interfaces import Featurizer
 
-        if self.multilayer_feature is None:
-            feature_selection = "last_hidden_state"
-        else:
+        if self.multilayer_feature:
             feature_selection = "hidden_states"
+        else:
+            feature_selection = "last_hidden_state"
         s3prl_featurizer = Featurizer(
             upstream=s3prl_upstream,
             feature_selection=feature_selection,
@@ -123,8 +123,7 @@ def forward(
     ) -> Tuple[torch.Tensor, torch.Tensor]:
         wavs = [wav[: input_lengths[i]] for i, wav in enumerate(input)]
         self.upstream.eval()
-        with torch.no_grad():
-            feats = self.upstream(wavs)
+        feats = self.upstream(wavs)
         feats = self.featurizer(wavs, feats)
 
         if self.args.tile_factor != 1:
diff --git a/espnet2/bin/asr_inference.py b/espnet2/bin/asr_inference.py
index d5a7932b0e6..fc6d75cb488 100755
--- a/espnet2/bin/asr_inference.py
+++ b/espnet2/bin/asr_inference.py
@@ -31,6 +31,7 @@
 from espnet2.asr.transducer.beam_search_transducer import Hypothesis as TransHypothesis
 from espnet2.fileio.datadir_writer import DatadirWriter
 from espnet2.tasks.asr import ASRTask
+from espnet2.tasks.enh_s2t import EnhS2TTask
 from espnet2.tasks.lm import LMTask
 from espnet2.text.build_tokenizer import build_tokenizer
 from espnet2.text.token_id_converter import TokenIDConverter
@@ -77,14 +78,29 @@ def __init__(
         penalty: float = 0.0,
         nbest: int = 1,
         streaming: bool = False,
+        enh_s2t_task: bool = False,
     ):
         assert check_argument_types()
 
+        task = ASRTask if not enh_s2t_task else EnhS2TTask
+
         # 1. Build ASR model
         scorers = {}
-        asr_model, asr_train_args = ASRTask.build_model_from_file(
+        asr_model, asr_train_args = task.build_model_from_file(
             asr_train_config, asr_model_file, device
         )
+        if enh_s2t_task:
+            asr_model.inherite_attributes(
+                inherite_s2t_attrs=[
+                    "ctc",
+                    "decoder",
+                    "eos",
+                    "joint_network",
+                    "sos",
+                    "token_list",
+                    "use_transducer_decoder",
+                ]
+            )
         asr_model.to(dtype=getattr(torch, dtype)).eval()
 
         decoder = asr_model.decoder
@@ -348,6 +364,7 @@ def inference(
     allow_variable_data_keys: bool,
     transducer_conf: Optional[dict],
     streaming: bool,
+    enh_s2t_task: bool,
 ):
     assert check_argument_types()
     if batch_size > 1:
@@ -391,6 +408,7 @@ def inference(
         penalty=penalty,
         nbest=nbest,
         streaming=streaming,
+        enh_s2t_task=enh_s2t_task,
     )
     speech2text = Speech2Text.from_pretrained(
         model_tag=model_tag,
@@ -532,6 +550,12 @@ def get_parser():
         help="Pretrained model tag. If specify this option, *_train_config and "
         "*_file will be overwritten",
     )
+    group.add_argument(
+        "--enh_s2t_task",
+        type=str2bool,
+        default=False,
+        help="enhancement and asr joint model",
+    )
 
     group = parser.add_argument_group("Beam-search related")
     group.add_argument(
diff --git a/espnet2/bin/enh_inference.py b/espnet2/bin/enh_inference.py
index 84a37b5ff7f..577efe2df43 100755
--- a/espnet2/bin/enh_inference.py
+++ b/espnet2/bin/enh_inference.py
@@ -22,6 +22,7 @@
 from espnet2.enh.loss.wrappers.pit_solver import PITSolver
 from espnet2.fileio.sound_scp import SoundScpWriter
 from espnet2.tasks.enh import EnhancementTask
+from espnet2.tasks.enh_s2t import EnhS2TTask
 from espnet2.torch_utils.device_funcs import to_device
 from espnet2.torch_utils.set_all_random_seed import set_all_random_seed
 from espnet2.utils import config_argparse
@@ -57,13 +58,19 @@ def __init__(
         normalize_output_wav: bool = False,
         device: str = "cpu",
         dtype: str = "float32",
+        enh_s2t_task: bool = False,
     ):
         assert check_argument_types()
 
+        task = EnhancementTask if not enh_s2t_task else EnhS2TTask
+
         # 1. Build Enh model
-        enh_model, enh_train_args = EnhancementTask.build_model_from_file(
+        enh_model, enh_train_args = task.build_model_from_file(
             train_config, model_file, device
         )
+        if enh_s2t_task:
+            enh_model = enh_model.enh_model
+
         enh_model.to(dtype=getattr(torch, dtype)).eval()
 
         self.device = device
@@ -312,6 +319,7 @@ def inference(
     show_progressbar: bool,
     ref_channel: Optional[int],
     normalize_output_wav: bool,
+    enh_s2t_task: bool,
 ):
     assert check_argument_types()
     if batch_size > 1:
@@ -344,6 +352,7 @@ def inference(
         normalize_output_wav=normalize_output_wav,
         device=device,
         dtype=dtype,
+        enh_s2t_task=enh_s2t_task,
     )
     separate_speech = SeparateSpeech.from_pretrained(
         model_tag=model_tag,
@@ -465,6 +474,12 @@ def get_parser():
         help="Pretrained model tag. If specify this option, train_config and "
         "model_file will be overwritten",
     )
+    group.add_argument(
+        "--enh_s2t_task",
+        type=str2bool,
+        default=False,
+        help="enhancement and asr joint model",
+    )
 
     group = parser.add_argument_group("Data loading related")
     group.add_argument(
diff --git a/espnet2/bin/enh_s2t_train.py b/espnet2/bin/enh_s2t_train.py
new file mode 100755
index 00000000000..93194d3696d
--- /dev/null
+++ b/espnet2/bin/enh_s2t_train.py
@@ -0,0 +1,23 @@
+#!/usr/bin/env python3
+from espnet2.tasks.enh_s2t import EnhS2TTask
+
+
+def get_parser():
+    """Return the command-line argument parser provided by EnhS2TTask."""
+    return EnhS2TTask.get_parser()
+
+
+def main(cmd=None):
+    r"""Run joint enhancement and speech-to-text (EnhS2T) training.
+
+    Example:
+
+        % python enh_s2t_train.py enh_s2t --print_config --optim adadelta \
+                > conf/train_enh_s2t.yaml
+        % python enh_s2t_train.py --config conf/train_enh_s2t.yaml
+    """
+    EnhS2TTask.main(cmd=cmd)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/espnet2/bin/enh_train.py b/espnet2/bin/enh_train.py
index ca4708eb87e..9f535fcce6b 100755
--- a/espnet2/bin/enh_train.py
+++ b/espnet2/bin/enh_train.py
@@ -12,7 +12,7 @@ def main(cmd=None):
 
     Example:
 
-        % python enh_train.py asr --print_config --optim adadelta \
+        % python enh_train.py enh --print_config --optim adadelta \
                 > conf/train_enh.yaml
         % python enh_train.py --config conf/train_enh.yaml
     """
diff --git a/espnet2/bin/pack.py b/espnet2/bin/pack.py
index 21d7b657683..e6492445f7a 100755
--- a/espnet2/bin/pack.py
+++ b/espnet2/bin/pack.py
@@ -36,6 +36,12 @@ class DiarPackedContents(PackedContents):
     yaml_files = ["train_config"]
 
 
+class EnhS2TPackedContents(PackedContents):
+    # These names must be consistent with the arguments of the inference functions
+    files = ["enh_s2t_model_file", "lm_file"]
+    yaml_files = ["enh_s2t_train_config", "lm_train_config"]
+
+
 def add_arguments(parser: argparse.ArgumentParser, contents: Type[PackedContents]):
     parser.add_argument("--outpath", type=str, required=True)
     for key in contents.yaml_files:
@@ -56,6 +62,7 @@ def get_parser() -> argparse.ArgumentParser:
         ("tts", TTSPackedContents),
         ("enh", EnhPackedContents),
         ("diar", DiarPackedContents),
+        ("enh_s2t", EnhS2TPackedContents),
     ]:
         parser_asr = subparsers.add_parser(
             name,
diff --git a/espnet2/bin/st_inference.py b/espnet2/bin/st_inference.py
index 1758a3ea895..4504e5e1e75 100755
--- a/espnet2/bin/st_inference.py
+++ b/espnet2/bin/st_inference.py
@@ -23,6 +23,7 @@
 from espnet.nets.scorers.length_bonus import LengthBonus
 from espnet.utils.cli_utils import get_commandline_args
 from espnet2.fileio.datadir_writer import DatadirWriter
+from espnet2.tasks.enh_s2t import EnhS2TTask
 from espnet2.tasks.lm import LMTask
 from espnet2.tasks.st import STTask
 from espnet2.text.build_tokenizer import build_tokenizer
@@ -67,14 +68,29 @@ def __init__(
         ngram_weight: float = 0.9,
         penalty: float = 0.0,
         nbest: int = 1,
+        enh_s2t_task: bool = False,
     ):
         assert check_argument_types()
 
+        task = STTask if not enh_s2t_task else EnhS2TTask
+
         # 1. Build ST model
         scorers = {}
-        st_model, st_train_args = STTask.build_model_from_file(
+        st_model, st_train_args = task.build_model_from_file(
             st_train_config, st_model_file, device
         )
+        if enh_s2t_task:
+            st_model.inherite_attributes(
+                inherite_s2t_attrs=[
+                    "ctc",
+                    "decoder",
+                    "eos",
+                    "joint_network",
+                    "sos",
+                    "token_list",
+                    "use_transducer_decoder",
+                ]
+            )
         st_model.to(dtype=getattr(torch, dtype)).eval()
 
         decoder = st_model.decoder
@@ -290,6 +306,7 @@ def inference(
     token_type: Optional[str],
     bpemodel: Optional[str],
     allow_variable_data_keys: bool,
+    enh_s2t_task: bool,
 ):
     assert check_argument_types()
     if batch_size > 1:
@@ -330,6 +347,7 @@ def inference(
         ngram_weight=ngram_weight,
         penalty=penalty,
         nbest=nbest,
+        enh_s2t_task=enh_s2t_task,
     )
     speech2text = Speech2Text.from_pretrained(
         model_tag=model_tag,
diff --git a/espnet2/diar/espnet_model.py b/espnet2/diar/espnet_model.py
index 8a59b3cb5a3..1e1d10af15e 100644
--- a/espnet2/diar/espnet_model.py
+++ b/espnet2/diar/espnet_model.py
@@ -78,6 +78,7 @@ def forward(
         speech_lengths: torch.Tensor = None,
         spk_labels: torch.Tensor = None,
         spk_labels_lengths: torch.Tensor = None,
+        **kwargs,
     ) -> Tuple[torch.Tensor, Dict[str, torch.Tensor], torch.Tensor]:
         """Frontend + Encoder + Decoder + Calc loss
 
@@ -89,6 +90,7 @@ def forward(
                                      see in
                                      espnet2/iterators/chunk_iter_factory.py
             spk_labels: (Batch, )
+            kwargs: "utt_id" is among the input.
         """
         assert speech.shape[0] == spk_labels.shape[0], (speech.shape, spk_labels.shape)
         batch_size = speech.shape[0]
@@ -191,6 +193,7 @@ def collect_feats(
         speech_lengths: torch.Tensor,
         spk_labels: torch.Tensor = None,
         spk_labels_lengths: torch.Tensor = None,
+        **kwargs,
     ) -> Dict[str, torch.Tensor]:
         feats, feats_lengths = self._extract_feats(speech, speech_lengths)
         return {"feats": feats, "feats_lengths": feats_lengths}
diff --git a/espnet2/enh/espnet_enh_s2t_model.py b/espnet2/enh/espnet_enh_s2t_model.py
new file mode 100644
index 00000000000..25b56a6952e
--- /dev/null
+++ b/espnet2/enh/espnet_enh_s2t_model.py
@@ -0,0 +1,273 @@
+from contextlib import contextmanager
+from distutils.version import LooseVersion
+import logging
+import random
+from typing import Dict
+from typing import List
+from typing import Tuple
+from typing import Union
+
+import torch
+from typeguard import check_argument_types
+
+from espnet2.asr.espnet_model import ESPnetASRModel
+from espnet2.enh.espnet_model import ESPnetEnhancementModel
+from espnet2.st.espnet_model import ESPnetSTModel
+from espnet2.torch_utils.device_funcs import force_gatherable
+from espnet2.train.abs_espnet_model import AbsESPnetModel
+
+if LooseVersion(torch.__version__) >= LooseVersion("1.6.0"):
+    from torch.cuda.amp import autocast
+else:
+    # Nothing to do if torch<1.6.0
+    @contextmanager
+    def autocast(enabled=True):
+        yield
+
+
+class ESPnetEnhS2TModel(AbsESPnetModel):
+    """Joint model of speech enhancement followed by speech-to-text (ASR or ST)."""
+
+    def __init__(
+        self,
+        enh_model: ESPnetEnhancementModel,
+        s2t_model: Union[ESPnetASRModel, ESPnetSTModel],
+        permutation_by_enh: bool = True,
+        calc_enh_loss: bool = True,
+        bypass_enh_prob: float = 0,  # 0 means do not bypass enhancement for all data
+    ):
+        assert check_argument_types()
+
+        super().__init__()
+        self.enh_model = enh_model
+        self.s2t_model = s2t_model  # ASR or ST model
+        self.bypass_enh_prob = bypass_enh_prob
+        self.permutation_by_enh = permutation_by_enh
+        self.calc_enh_loss = calc_enh_loss
+        # collect_feats() reads this flag, so mirror the one of the S2T model
+        self.extract_feats_in_collect_stats = s2t_model.extract_feats_in_collect_stats
+
+    def forward(
+        self,
+        speech: torch.Tensor,
+        speech_lengths: torch.Tensor,
+        text: torch.Tensor,
+        text_lengths: torch.Tensor,
+        **kwargs,
+    ) -> Tuple[torch.Tensor, Dict[str, torch.Tensor], torch.Tensor]:
+        """Enhancement + ASR/ST forward pass + Calc joint loss
+
+        Args:
+            speech: (Batch, Length, ...)
+            speech_lengths: (Batch, )
+            text: (Batch, Length)
+            text_lengths: (Batch,)
+            kwargs: may hold "utt_id", "speech_ref1", "src_text", "src_text_lengths"
+        """
+        assert text_lengths.dim() == 1, text_lengths.shape
+        # Check that batch_size is unified
+        assert (
+            speech.shape[0]
+            == speech_lengths.shape[0]
+            == text.shape[0]
+            == text_lengths.shape[0]
+        ), (speech.shape, speech_lengths.shape, text.shape, text_lengths.shape)
+
+        # additional checks with valid src_text
+        if "src_text" in kwargs:
+            src_text = kwargs["src_text"]
+            src_text_lengths = kwargs["src_text_lengths"]
+
+            if src_text is not None:
+                assert src_text_lengths.dim() == 1, src_text_lengths.shape
+                assert (
+                    text.shape[0] == src_text.shape[0] == src_text_lengths.shape[0]
+                ), (
+                    text.shape,
+                    src_text.shape,
+                    src_text_lengths.shape,
+                )
+        else:
+            src_text = None
+            src_text_lengths = None
+
+        batch_size = speech.shape[0]
+
+        # clean speech signal
+        speech_ref = None
+        if self.calc_enh_loss:
+            assert "speech_ref1" in kwargs
+            speech_ref = [kwargs["speech_ref1"]]  # [(Batch, samples)] x num_spkr
+
+        # Decide from utt_id whether to bypass enhancement and/or skip loss_enh
+        utt_id = kwargs.get("utt_id", None)
+        bypass_enh_flag, skip_enhloss_flag = False, False
+        if utt_id is not None:
+            # TODO(xkc): to pass category info and use predefined category list
+            if utt_id[0].endswith("SIMU"):
+                # For simulated single-/multi-speaker data
+                # feed it to Enhancement and calculate loss_enh
+                bypass_enh_flag = False
+                skip_enhloss_flag = False
+            elif utt_id[0].endswith("REAL"):
+                # For single-speaker real data
+                # feed it to Enhancement but without calculating loss_enh
+                bypass_enh_flag = False
+                skip_enhloss_flag = True
+            else:
+                # For clean data
+                # bypass Enhancement entirely, so loss_enh is not calculated
+                bypass_enh_flag = True
+                skip_enhloss_flag = True
+
+        if not self.calc_enh_loss:
+            skip_enhloss_flag = True
+
+        # Randomly bypass the enhancement module; "<" so that prob=0 never bypasses
+        if (
+            self.training and skip_enhloss_flag and not bypass_enh_flag
+        ):  # For single-speaker real data: possibility to bypass frontend
+            if random.random() < self.bypass_enh_prob:
+                bypass_enh_flag = True
+
+        # 1. Enhancement
+        # model forward
+        loss_enh = None
+        if not bypass_enh_flag:
+            (
+                speech_pre,
+                feature_mix,
+                feature_pre,
+                others,
+            ) = self.enh_model.forward_enhance(speech, speech_lengths)
+            # loss computation
+            if not skip_enhloss_flag:
+                loss_enh, _, _ = self.enh_model.forward_loss(
+                    speech_pre,
+                    speech_lengths,
+                    feature_mix,
+                    feature_pre,
+                    others,
+                    speech_ref,
+                )
+                loss_enh = loss_enh[0]
+        else:
+            speech_pre = [speech]
+
+        # for data-parallel
+        text = text[:, : text_lengths.max()]
+        if src_text is not None:
+            src_text = src_text[:, : src_text_lengths.max()]
+
+        # 2. ASR or ST
+        if isinstance(self.s2t_model, ESPnetASRModel):  # ASR
+            loss_asr, stats, weight = self.s2t_model(
+                speech_pre[0], speech_lengths, text, text_lengths
+            )
+        elif isinstance(self.s2t_model, ESPnetSTModel):  # ST
+            loss_asr, stats, weight = self.s2t_model(
+                speech_pre[0],
+                speech_lengths,
+                text,
+                text_lengths,
+                src_text,
+                src_text_lengths,
+            )
+        else:
+            raise NotImplementedError(f"{type(self.s2t_model)} is not supported yet.")
+
+        if loss_enh is not None:
+            loss = loss_enh + loss_asr
+        else:
+            loss = loss_asr
+
+        stats["loss"] = loss.detach() if loss is not None else None
+        stats["loss_enh"] = loss_enh.detach() if loss_enh is not None else None
+
+        # force_gatherable: to-device and to-tensor if scalar for DataParallel
+        loss, stats, weight = force_gatherable((loss, stats, batch_size), loss.device)
+        return loss, stats, weight
+
+    def collect_feats(
+        self,
+        speech: torch.Tensor,
+        speech_lengths: torch.Tensor,
+        text: torch.Tensor,
+        text_lengths: torch.Tensor,
+        **kwargs,
+    ) -> Dict[str, torch.Tensor]:
+        if self.extract_feats_in_collect_stats:
+            ret = self.s2t_model.collect_feats(
+                speech,
+                speech_lengths,
+                text,
+                text_lengths,
+                **kwargs,
+            )
+            feats, feats_lengths = ret["feats"], ret["feats_lengths"]
+        else:
+            # Generate dummy stats if extract_feats_in_collect_stats is False
+            logging.warning(
+                "Generating dummy stats for feats and feats_lengths, "
+                "because encoder_conf.extract_feats_in_collect_stats is "
+                f"{self.extract_feats_in_collect_stats}"
+            )
+            feats, feats_lengths = speech, speech_lengths
+        return {"feats": feats, "feats_lengths": feats_lengths}
+
+    def encode(
+        self, speech: torch.Tensor, speech_lengths: torch.Tensor
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        """Enhancement + S2T encoder. Note that this method is used by asr_inference.py
+
+        Args:
+            speech: (Batch, Length, ...)
+            speech_lengths: (Batch, )
+        """
+        speech_pre, feature_mix, feature_pre, others = self.enh_model.forward_enhance(
+            speech, speech_lengths
+        )
+        encoder_out, encoder_out_lens = self.s2t_model.encode(
+            speech_pre[0], speech_lengths
+        )
+
+        return encoder_out, encoder_out_lens
+
+    def nll(
+        self,
+        encoder_out: torch.Tensor,
+        encoder_out_lens: torch.Tensor,
+        ys_pad: torch.Tensor,
+        ys_pad_lens: torch.Tensor,
+    ) -> torch.Tensor:
+        """Compute negative log likelihood(nll) by delegating to the S2T model
+
+        Normally, this function is called in batchify_nll.
+
+        Args:
+            encoder_out: (Batch, Length, Dim)
+            encoder_out_lens: (Batch,)
+            ys_pad: (Batch, Length)
+            ys_pad_lens: (Batch,)
+        """
+        return self.s2t_model.nll(
+            encoder_out,
+            encoder_out_lens,
+            ys_pad,
+            ys_pad_lens,
+        )
+
+    batchify_nll = ESPnetASRModel.batchify_nll
+
+    def inherite_attributes(
+        self,
+        inherite_enh_attrs: List[str] = [],
+        inherite_s2t_attrs: List[str] = [],
+    ):
+        """Expose the named sub-model attributes on this model (None if absent)."""
+        assert check_argument_types()
+
+        for attr in inherite_enh_attrs:
+            setattr(self, attr, getattr(self.enh_model, attr, None))
+        for attr in inherite_s2t_attrs:
+            setattr(self, attr, getattr(self.s2t_model, attr, None))
diff --git a/espnet2/enh/espnet_model.py b/espnet2/enh/espnet_model.py
index f9824471604..a2f4214babf 100644
--- a/espnet2/enh/espnet_model.py
+++ b/espnet2/enh/espnet_model.py
@@ -3,6 +3,7 @@
 from typing import Dict
 from typing import List
 from typing import Optional
+from typing import OrderedDict
 from typing import Tuple
 
 import torch
@@ -77,6 +78,7 @@ def forward(
                             because the chunk-iterator does not have the
                             speech_lengths returned. see in
                             espnet2/iterators/chunk_iter_factory.py
+            kwargs: "utt_id" is among the input.
         """
         # clean speech signal of each speaker
         speech_ref = [
@@ -136,6 +138,28 @@ def forward(
         speech_mix = speech_mix[:, : speech_lengths.max()]
 
         # model forward
+        speech_pre, feature_mix, feature_pre, others = self.forward_enhance(
+            speech_mix, speech_lengths
+        )
+
+        # loss computation
+        loss, stats, weight = self.forward_loss(
+            speech_pre,
+            speech_lengths,
+            feature_mix,
+            feature_pre,
+            others,
+            speech_ref,
+            noise_ref,
+            dereverb_speech_ref,
+        )
+        return loss, stats, weight
+
+    def forward_enhance(
+        self,
+        speech_mix: torch.Tensor,
+        speech_lengths: torch.Tensor,
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
         feature_mix, flens = self.encoder(speech_mix, speech_lengths)
         feature_pre, flens, others = self.separator(feature_mix, flens)
         if feature_pre is not None:
@@ -144,7 +168,19 @@ def forward(
             # some models (e.g. neural beamformer trained with mask loss)
             # do not predict time-domain signal in the training stage
             speech_pre = None
+        return speech_pre, feature_mix, feature_pre, others
 
+    def forward_loss(
+        self,
+        speech_pre: torch.Tensor,
+        speech_lengths: torch.Tensor,
+        feature_mix: torch.Tensor,
+        feature_pre: torch.Tensor,
+        others: OrderedDict,
+        speech_ref: torch.Tensor,
+        noise_ref: torch.Tensor = None,
+        dereverb_speech_ref: torch.Tensor = None,
+    ) -> Tuple[torch.Tensor, Dict[str, torch.Tensor], torch.Tensor]:
         loss = 0.0
         stats = dict()
         o = {}
@@ -185,6 +221,7 @@ def forward(
         stats["loss"] = loss.detach()
 
         # force_gatherable: to-device and to-tensor if scalar for DataParallel
+        batch_size = speech_ref[0].shape[0]
         loss, stats, weight = force_gatherable((loss, stats, batch_size), loss.device)
         return loss, stats, weight
 
diff --git a/espnet2/gan_tts/espnet_model.py b/espnet2/gan_tts/espnet_model.py
index cbb39cc682b..34ca845f0fd 100644
--- a/espnet2/gan_tts/espnet_model.py
+++ b/espnet2/gan_tts/espnet_model.py
@@ -74,6 +74,7 @@ def forward(
         sids: Optional[torch.Tensor] = None,
         lids: Optional[torch.Tensor] = None,
         forward_generator: bool = True,
+        **kwargs,
     ) -> Dict[str, Any]:
         """Return generator or discriminator loss with dict format.
 
@@ -92,6 +93,7 @@ def forward(
             sids (Optional[Tensor]): Speaker ID tensor (B, 1).
             lids (Optional[Tensor]): Language ID tensor (B, 1).
             forward_generator (bool): Whether to forward generator.
+            kwargs: "utt_id" is among the input.
 
         Returns:
             Dict[str, Any]:
@@ -176,6 +178,7 @@ def collect_feats(
         spembs: Optional[torch.Tensor] = None,
         sids: Optional[torch.Tensor] = None,
         lids: Optional[torch.Tensor] = None,
+        **kwargs,
     ) -> Dict[str, torch.Tensor]:
         """Calculate features and return them as a dict.
 
diff --git a/espnet2/hubert/espnet_model.py b/espnet2/hubert/espnet_model.py
index bc5bd451bfd..4fa775841bc 100644
--- a/espnet2/hubert/espnet_model.py
+++ b/espnet2/hubert/espnet_model.py
@@ -97,6 +97,7 @@ def forward(
         speech_lengths: torch.Tensor,
         text: torch.Tensor,
         text_lengths: torch.Tensor,
+        **kwargs,
     ) -> Tuple[torch.Tensor, Dict[str, torch.Tensor], torch.Tensor]:
         """Frontend + Encoder + Calc loss
 
@@ -105,6 +106,7 @@ def forward(
             speech_lengths: (Batch, )
             text: (Batch, Length)
             text_lengths: (Batch,)
+            kwargs: "utt_id" is among the input.
         """
         assert text_lengths.dim() == 1, text_lengths.shape
         # Check that batch_size is unified
@@ -144,6 +146,7 @@ def collect_feats(
         speech_lengths: torch.Tensor,
         text: torch.Tensor,
         text_lengths: torch.Tensor,
+        **kwargs,
     ) -> Dict[str, torch.Tensor]:
         feats, feats_lengths = self._extract_feats(speech, speech_lengths)
         return {"feats": feats, "feats_lengths": feats_lengths}
diff --git a/espnet2/lm/espnet_model.py b/espnet2/lm/espnet_model.py
index 0309ee4ffb0..de6cd114a25 100644
--- a/espnet2/lm/espnet_model.py
+++ b/espnet2/lm/espnet_model.py
@@ -114,7 +114,10 @@ def batchify_nll(
         return nll, x_lengths
 
     def forward(
-        self, text: torch.Tensor, text_lengths: torch.Tensor
+        self,
+        text: torch.Tensor,
+        text_lengths: torch.Tensor,
+        **kwargs,
     ) -> Tuple[torch.Tensor, Dict[str, torch.Tensor], torch.Tensor]:
         nll, y_lengths = self.nll(text, text_lengths)
         ntokens = y_lengths.sum()
@@ -126,6 +129,9 @@ def forward(
         return loss, stats, weight
 
     def collect_feats(
-        self, text: torch.Tensor, text_lengths: torch.Tensor
+        self,
+        text: torch.Tensor,
+        text_lengths: torch.Tensor,
+        **kwargs,
     ) -> Dict[str, torch.Tensor]:
         return {}
diff --git a/espnet2/main_funcs/calculate_all_attentions.py b/espnet2/main_funcs/calculate_all_attentions.py
index ed53d2b89c5..52fe045779b 100644
--- a/espnet2/main_funcs/calculate_all_attentions.py
+++ b/espnet2/main_funcs/calculate_all_attentions.py
@@ -107,7 +107,7 @@ def hook(module, input, output, name=name):
     # Batch-mode can't be used to keep requirements small for each models.
     keys = []
     for k in batch:
-        if not k.endswith("_lengths"):
+        if not (k.endswith("_lengths") or k in ["utt_id"]):
             keys.append(k)
 
     return_dict = defaultdict(list)
@@ -128,6 +128,10 @@ def hook(module, input, output, name=name):
                 if k + "_lengths" in batch
             }
         )
+
+        if "utt_id" in batch:
+            _sample["utt_id"] = batch["utt_id"]
+
         model(**_sample)
 
         # Derive the attention results
diff --git a/espnet2/mt/espnet_model.py b/espnet2/mt/espnet_model.py
index f93b5d417b2..953d5bc02f8 100644
--- a/espnet2/mt/espnet_model.py
+++ b/espnet2/mt/espnet_model.py
@@ -120,6 +120,7 @@ def forward(
         text_lengths: torch.Tensor,
         src_text: torch.Tensor,
         src_text_lengths: torch.Tensor,
+        **kwargs,
     ) -> Tuple[torch.Tensor, Dict[str, torch.Tensor], torch.Tensor]:
         """Frontend + Encoder + Decoder + Calc loss
 
@@ -128,6 +129,7 @@ def forward(
             text_lengths: (Batch,)
             src_text: (Batch, length)
             src_text_lengths: (Batch,)
+            kwargs: "utt_id" is among the input.
         """
         assert text_lengths.dim() == 1, text_lengths.shape
         # Check that batch_size is unified
@@ -171,6 +173,7 @@ def collect_feats(
         text_lengths: torch.Tensor,
         src_text: torch.Tensor,
         src_text_lengths: torch.Tensor,
+        **kwargs,
     ) -> Dict[str, torch.Tensor]:
         if self.extract_feats_in_collect_stats:
             feats, feats_lengths = self._extract_feats(src_text, src_text_lengths)
diff --git a/espnet2/st/espnet_model.py b/espnet2/st/espnet_model.py
index eb4a707f6ca..e298ef1822d 100644
--- a/espnet2/st/espnet_model.py
+++ b/espnet2/st/espnet_model.py
@@ -167,6 +167,7 @@ def forward(
         text_lengths: torch.Tensor,
         src_text: Optional[torch.Tensor],
         src_text_lengths: Optional[torch.Tensor],
+        **kwargs,
     ) -> Tuple[torch.Tensor, Dict[str, torch.Tensor], torch.Tensor]:
         """Frontend + Encoder + Decoder + Calc loss
 
@@ -177,6 +178,7 @@ def forward(
             text_lengths: (Batch,)
             src_text: (Batch, length)
             src_text_lengths: (Batch,)
+            kwargs: "utt_id" is among the input.
         """
         assert text_lengths.dim() == 1, text_lengths.shape
         # Check that batch_size is unified
@@ -287,6 +289,7 @@ def collect_feats(
         text_lengths: torch.Tensor,
         src_text: Optional[torch.Tensor],
         src_text_lengths: Optional[torch.Tensor],
+        **kwargs,
     ) -> Dict[str, torch.Tensor]:
         if self.extract_feats_in_collect_stats:
             feats, feats_lengths = self._extract_feats(speech, speech_lengths)
diff --git a/espnet2/tasks/enh_asr.py b/espnet2/tasks/enh_asr.py
deleted file mode 100644
index c452ab2201d..00000000000
--- a/espnet2/tasks/enh_asr.py
+++ /dev/null
@@ -1,369 +0,0 @@
-import argparse
-import logging
-from typing import Callable
-from typing import Collection
-from typing import Dict
-from typing import List
-from typing import Optional
-from typing import Tuple
-
-import numpy as np
-import torch
-from typeguard import check_argument_types
-from typeguard import check_return_type
-
-from espnet2.asr.ctc import CTC
-from espnet2.asr.decoder.abs_decoder import AbsDecoder
-from espnet2.asr.decoder.rnn_decoder import RNNDecoder
-from espnet2.asr.decoder.transformer_decoder import TransformerDecoder
-from espnet2.asr.encoder.abs_encoder import AbsEncoder
-from espnet2.asr.encoder.rnn_encoder import RNNEncoder
-from espnet2.asr.encoder.transformer_encoder import TransformerEncoder
-from espnet2.asr.encoder.vgg_rnn_encoder import VGGRNNEncoder
-from espnet2.asr.espnet_joint_model import ESPnetEnhASRModel
-from espnet2.asr.espnet_model import ESPnetASRModel
-from espnet2.asr.frontend.abs_frontend import AbsFrontend
-from espnet2.asr.frontend.default import DefaultFrontend
-from espnet2.asr.specaug.abs_specaug import AbsSpecAug
-from espnet2.asr.specaug.specaug import SpecAug
-from espnet2.enh.abs_enh import AbsEnhancement
-from espnet2.enh.espnet_model import ESPnetEnhancementModel
-from espnet2.enh.nets.beamformer_net import BeamformerNet
-from espnet2.enh.nets.tasnet import TasNet
-from espnet2.enh.nets.tf_mask_net import TFMaskingNet
-from espnet2.layers.abs_normalize import AbsNormalize
-from espnet2.layers.global_mvn import GlobalMVN
-from espnet2.layers.utterance_mvn import UtteranceMVN
-from espnet2.tasks.abs_task import AbsTask
-from espnet2.text.phoneme_tokenizer import g2p_choices
-from espnet2.torch_utils.initialize import initialize
-from espnet2.train.class_choices import ClassChoices
-from espnet2.train.collate_fn import CommonCollateFn
-from espnet2.train.preprocessor import CommonPreprocessor_multi
-from espnet2.train.trainer import Trainer
-from espnet2.utils.get_default_kwargs import get_default_kwargs
-from espnet2.utils.nested_dict_action import NestedDictAction
-from espnet2.utils.types import int_or_none
-from espnet2.utils.types import str2bool
-from espnet2.utils.types import str_or_none
-
-enh_choices = ClassChoices(
-    name="enh",
-    classes=dict(tf_masking=TFMaskingNet, tasnet=TasNet, wpe_beamformer=BeamformerNet),
-    type_check=AbsEnhancement,
-    default="tf_masking",
-)
-frontend_choices = ClassChoices(
-    name="frontend",
-    classes=dict(default=DefaultFrontend),
-    type_check=AbsFrontend,
-    default="default",
-)
-specaug_choices = ClassChoices(
-    name="specaug",
-    classes=dict(specaug=SpecAug),
-    type_check=AbsSpecAug,
-    default=None,
-    optional=True,
-)
-normalize_choices = ClassChoices(
-    "normalize",
-    classes=dict(
-        global_mvn=GlobalMVN,
-        utterance_mvn=UtteranceMVN,
-    ),
-    type_check=AbsNormalize,
-    default="utterance_mvn",
-    optional=True,
-)
-encoder_choices = ClassChoices(
-    "encoder",
-    classes=dict(
-        transformer=TransformerEncoder,
-        vgg_rnn=VGGRNNEncoder,
-        rnn=RNNEncoder,
-    ),
-    type_check=AbsEncoder,
-    default="rnn",
-)
-decoder_choices = ClassChoices(
-    "decoder",
-    classes=dict(transformer=TransformerDecoder, rnn=RNNDecoder),
-    type_check=AbsDecoder,
-    default="rnn",
-)
-
-MAX_REFERENCE_NUM = 100
-
-
-class ASRTask(AbsTask):
-    # If you need more than one optimizers, change this value
-    num_optimizers: int = 1
-
-    # Add variable objects configurations
-    class_choices_list = [
-        # --enh and --enh_conf
-        enh_choices,
-        # --frontend and --frontend_conf
-        frontend_choices,
-        # --specaug and --specaug_conf
-        specaug_choices,
-        # --normalize and --normalize_conf
-        normalize_choices,
-        # --encoder and --encoder_conf
-        encoder_choices,
-        # --decoder and --decoder_conf
-        decoder_choices,
-    ]
-
-    # If you need to modify train() or eval() procedures, change Trainer class here
-    trainer = Trainer
-
-    @classmethod
-    def add_task_arguments(cls, parser: argparse.ArgumentParser):
-        group = parser.add_argument_group(description="Task related")
-
-        # NOTE(kamo): add_arguments(..., required=True) can't be used
-        # to provide --print_config mode. Instead of it, do as
-        required = parser.get_default("required")
-        required += ["token_list"]
-
-        group.add_argument(
-            "--token_list",
-            type=str_or_none,
-            default=None,
-            help="A text mapping int-id to token",
-        )
-        group.add_argument(
-            "--init",
-            type=lambda x: str_or_none(x.lower()),
-            default=None,
-            help="The initialization method",
-            choices=[
-                "chainer",
-                "xavier_uniform",
-                "xavier_normal",
-                "kaiming_uniform",
-                "kaiming_normal",
-                None,
-            ],
-        )
-
-        group.add_argument(
-            "--input_size",
-            type=int_or_none,
-            default=None,
-            help="The number of input dimension of the feature",
-        )
-
-        group.add_argument(
-            "--ctc_conf",
-            action=NestedDictAction,
-            default=get_default_kwargs(CTC),
-            help="The keyword arguments for CTC class.",
-        )
-        group.add_argument(
-            "--asr_model_conf",
-            action=NestedDictAction,
-            default=get_default_kwargs(ESPnetASRModel),
-            help="The keyword arguments for model class.",
-        )
-
-        group.add_argument(
-            "--enh_model_conf",
-            action=NestedDictAction,
-            default=get_default_kwargs(ESPnetEnhancementModel),
-            help="The keyword arguments for model class.",
-        )
-
-        group = parser.add_argument_group(description="Preprocess related")
-        group.add_argument(
-            "--use_preprocessor",
-            type=str2bool,
-            default=False,
-            help="Apply preprocessing to data or not",
-        )
-        group.add_argument(
-            "--token_type",
-            type=str,
-            default="bpe",
-            choices=["bpe", "char", "word", "phn"],
-            help="The text will be tokenized " "in the specified level token",
-        )
-        group.add_argument(
-            "--bpemodel",
-            type=str_or_none,
-            default=None,
-            help="The model file of sentencepiece",
-        )
-        parser.add_argument(
-            "--non_linguistic_symbols",
-            type=str_or_none,
-            help="non_linguistic_symbols file path",
-        )
-        parser.add_argument(
-            "--cleaner",
-            type=str_or_none,
-            choices=[None, "tacotron", "jaconv", "vietnamese"],
-            default=None,
-            help="Apply text cleaning",
-        )
-        parser.add_argument(
-            "--g2p",
-            type=str_or_none,
-            choices=g2p_choices,
-            default=None,
-            help="Specify g2p method if --token_type=phn",
-        )
-
-        for class_choices in cls.class_choices_list:
-            # Append --<name> and --<name>_conf.
-            # e.g. --encoder and --encoder_conf
-            class_choices.add_arguments(group)
-
-    @classmethod
-    def build_collate_fn(
-        cls, args: argparse.Namespace, train: bool
-    ) -> Callable[
-        [Collection[Tuple[str, Dict[str, np.ndarray]]]],
-        Tuple[List[str], Dict[str, torch.Tensor]],
-    ]:
-        assert check_argument_types()
-        # NOTE(kamo): int value = 0 is reserved by CTC-blank symbol
-        return CommonCollateFn(float_pad_value=0.0, int_pad_value=-1)
-
-    @classmethod
-    def build_preprocess_fn(
-        cls, args: argparse.Namespace, train: bool
-    ) -> Optional[Callable[[str, Dict[str, np.array]], Dict[str, np.ndarray]]]:
-        assert check_argument_types()
-        # TODO(Jing): ask Kamo if it ok to support several args,
-        # like text_name = 'text_ref1' and 'text_ref2'
-        if args.use_preprocessor:
-            retval = CommonPreprocessor_multi(
-                train=train,
-                token_type=args.token_type,
-                token_list=args.token_list,
-                bpemodel=args.bpemodel,
-                non_linguistic_symbols=args.non_linguistic_symbols,
-                text_name=["text_ref1", "text_ref2"],
-                text_cleaner=args.cleaner,
-                g2p_type=args.g2p,
-            )
-        else:
-            retval = None
-        assert check_return_type(retval)
-        return retval
-
-    @classmethod
-    def required_data_names(
-        cls, train: bool = True, inference: bool = False
-    ) -> Tuple[str, ...]:
-        if not inference:
-            retval = ("speech_mix", "speech_ref1", "text_ref1")
-        else:
-            # Recognition mode
-            retval = ("speech_mix",)
-        return retval
-
-    @classmethod
-    def optional_data_names(
-        cls, train: bool = True, inference: bool = False
-    ) -> Tuple[str, ...]:
-        retval = ["dereverb_ref"]
-        retval += ["speech_ref{}".format(n) for n in range(2, MAX_REFERENCE_NUM + 1)]
-        retval += ["text_ref{}".format(n) for n in range(2, MAX_REFERENCE_NUM + 1)]
-        retval += ["noise_ref{}".format(n) for n in range(1, MAX_REFERENCE_NUM + 1)]
-        retval = tuple(retval)
-        assert check_return_type(retval)
-        return retval
-
-    @classmethod
-    def build_model(cls, args: argparse.Namespace) -> ESPnetEnhASRModel:
-        assert check_argument_types()
-        if isinstance(args.token_list, str):
-            with open(args.token_list, encoding="utf-8") as f:
-                token_list = [line.rstrip() for line in f]
-
-            # Overwriting token_list to keep it as "portable".
-            args.token_list = list(token_list)
-        elif isinstance(args.token_list, (tuple, list)):
-            token_list = list(args.token_list)
-        else:
-            raise RuntimeError("token_list must be str or list")
-        vocab_size = len(token_list)
-        logging.info(f"Vocabulary size: {vocab_size }")
-
-        # 0. Build pre enhancement model
-        enh_model = enh_choices.get_class(args.enh)(**args.enh_conf)
-
-        # 1. frontend
-        if args.input_size is None:
-            # Extract features in the model
-            frontend_class = frontend_choices.get_class(args.frontend)
-            frontend = frontend_class(**args.frontend_conf)
-            input_size = frontend.output_size()
-        else:
-            # Give features from data-loader
-            args.frontend = None
-            args.frontend_conf = {}
-            frontend = None
-            input_size = args.input_size
-
-        # 2. Data augmentation for spectrogram
-        if args.specaug is not None:
-            specaug_class = specaug_choices.get_class(args.specaug)
-            specaug = specaug_class(**args.specaug_conf)
-        else:
-            specaug = None
-
-        # 3. Normalization layer
-        if args.normalize is not None:
-            normalize_class = normalize_choices.get_class(args.normalize)
-            normalize = normalize_class(**args.normalize_conf)
-        else:
-            normalize = None
-
-        # 4. Encoder
-        encoder_class = encoder_choices.get_class(args.encoder)
-        encoder = encoder_class(input_size=input_size, **args.encoder_conf)
-
-        # 5. Decoder
-        decoder_class = decoder_choices.get_class(args.decoder)
-
-        decoder = decoder_class(
-            vocab_size=vocab_size,
-            encoder_output_size=encoder.output_size(),
-            **args.decoder_conf,
-        )
-
-        # 6. CTC
-        ctc = CTC(
-            odim=vocab_size, encoder_output_size=encoder.output_size(), **args.ctc_conf
-        )
-
-        # 7. RNN-T Decoder (Not implemented)
-        rnnt_decoder = None
-
-        # 8. Build model
-        model = ESPnetEnhASRModel(
-            vocab_size=vocab_size,
-            enh=enh_model,
-            frontend=frontend,
-            specaug=specaug,
-            normalize=normalize,
-            encoder=encoder,
-            decoder=decoder,
-            ctc=ctc,
-            rnnt_decoder=rnnt_decoder,
-            token_list=token_list,
-            **args.asr_model_conf,
-        )
-
-        # FIXME(kamo): Should be done in model?
-        # 9. Initialize
-        if args.init is not None:
-            initialize(model, args.init)
-
-        assert check_return_type(model)
-        return model
diff --git a/espnet2/tasks/enh_s2t.py b/espnet2/tasks/enh_s2t.py
new file mode 100644
index 00000000000..d6a20bac700
--- /dev/null
+++ b/espnet2/tasks/enh_s2t.py
@@ -0,0 +1,475 @@
+import argparse
+import copy
+import logging
+from typing import Callable
+from typing import Collection
+from typing import Dict
+from typing import List
+from typing import Optional
+from typing import Tuple
+
+import numpy as np
+import torch
+from typeguard import check_argument_types
+from typeguard import check_return_type
+
+from espnet2.asr.ctc import CTC
+from espnet2.asr.espnet_model import ESPnetASRModel
+from espnet2.enh.espnet_enh_s2t_model import ESPnetEnhS2TModel
+from espnet2.enh.espnet_model import ESPnetEnhancementModel
+from espnet2.tasks.abs_task import AbsTask
+from espnet2.tasks.asr import ASRTask
+from espnet2.tasks.asr import decoder_choices as asr_decoder_choices_
+from espnet2.tasks.asr import encoder_choices as asr_encoder_choices_
+from espnet2.tasks.asr import frontend_choices
+from espnet2.tasks.asr import normalize_choices
+from espnet2.tasks.asr import postencoder_choices as asr_postencoder_choices_
+from espnet2.tasks.asr import preencoder_choices as asr_preencoder_choices_
+from espnet2.tasks.asr import specaug_choices
+from espnet2.tasks.enh import decoder_choices as enh_decoder_choices_
+from espnet2.tasks.enh import encoder_choices as enh_encoder_choices_
+from espnet2.tasks.enh import EnhancementTask
+from espnet2.tasks.enh import separator_choices as enh_separator_choices_
+from espnet2.tasks.st import decoder_choices as st_decoder_choices_
+from espnet2.tasks.st import encoder_choices as st_encoder_choices_
+from espnet2.tasks.st import extra_asr_decoder_choices as st_extra_asr_decoder_choices_
+from espnet2.tasks.st import extra_mt_decoder_choices as st_extra_mt_decoder_choices_
+from espnet2.tasks.st import postencoder_choices as st_postencoder_choices_
+from espnet2.tasks.st import preencoder_choices as st_preencoder_choices_
+from espnet2.tasks.st import STTask
+from espnet2.text.phoneme_tokenizer import g2p_choices
+from espnet2.torch_utils.initialize import initialize
+from espnet2.train.collate_fn import CommonCollateFn
+from espnet2.train.preprocessor import CommonPreprocessor_multi
+from espnet2.train.preprocessor import MutliTokenizerCommonPreprocessor
+from espnet2.train.trainer import Trainer
+from espnet2.utils.get_default_kwargs import get_default_kwargs
+from espnet2.utils.nested_dict_action import NestedDictAction
+from espnet2.utils.types import int_or_none
+from espnet2.utils.types import str2bool
+from espnet2.utils.types import str_or_none
+
+
+# Enhancement: deep copies renamed so their options carry an "enh_" prefix
+enh_encoder_choices = copy.deepcopy(enh_encoder_choices_)
+enh_encoder_choices.name = "enh_encoder"
+enh_decoder_choices = copy.deepcopy(enh_decoder_choices_)
+enh_decoder_choices.name = "enh_decoder"
+enh_separator_choices = copy.deepcopy(enh_separator_choices_)
+enh_separator_choices.name = "enh_separator"
+
+# ASR (also SLU): deep copies renamed so their options carry an "asr_" prefix
+asr_preencoder_choices = copy.deepcopy(asr_preencoder_choices_)
+asr_preencoder_choices.name = "asr_preencoder"
+asr_encoder_choices = copy.deepcopy(asr_encoder_choices_)
+asr_encoder_choices.name = "asr_encoder"
+asr_postencoder_choices = copy.deepcopy(asr_postencoder_choices_)
+asr_postencoder_choices.name = "asr_postencoder"
+asr_decoder_choices = copy.deepcopy(asr_decoder_choices_)
+asr_decoder_choices.name = "asr_decoder"
+
+# ST: deep copies renamed so their options carry an "st_" prefix
+st_preencoder_choices = copy.deepcopy(st_preencoder_choices_)
+st_preencoder_choices.name = "st_preencoder"
+st_encoder_choices = copy.deepcopy(st_encoder_choices_)
+st_encoder_choices.name = "st_encoder"
+st_postencoder_choices = copy.deepcopy(st_postencoder_choices_)
+st_postencoder_choices.name = "st_postencoder"
+st_decoder_choices = copy.deepcopy(st_decoder_choices_)
+st_decoder_choices.name = "st_decoder"
+st_extra_asr_decoder_choices = copy.deepcopy(st_extra_asr_decoder_choices_)
+st_extra_asr_decoder_choices.name = "st_extra_asr_decoder"
+st_extra_mt_decoder_choices = copy.deepcopy(st_extra_mt_decoder_choices_)
+st_extra_mt_decoder_choices.name = "st_extra_mt_decoder"
+
+MAX_REFERENCE_NUM = 100
+
+name2task = dict(
+    enh=EnhancementTask,
+    asr=ASRTask,
+    st=STTask,
+)
+
+# Option names build_model() copies into each subtask's config (extendable)
+enh_attributes = [
+    "encoder",
+    "encoder_conf",
+    "separator",
+    "separator_conf",
+    "decoder",
+    "decoder_conf",
+    "criterions",
+]
+
+asr_attributes = [
+    "token_list",
+    "input_size",
+    "frontend",
+    "frontend_conf",
+    "specaug",
+    "specaug_conf",
+    "normalize",
+    "normalize_conf",
+    "preencoder",
+    "preencoder_conf",
+    "encoder",
+    "encoder_conf",
+    "postencoder",
+    "postencoder_conf",
+    "decoder",
+    "decoder_conf",
+    "ctc_conf",
+]
+
+st_attributes = [
+    "token_list",
+    "src_token_list",
+    "input_size",
+    "frontend",
+    "frontend_conf",
+    "specaug",
+    "specaug_conf",
+    "normalize",
+    "normalize_conf",
+    "preencoder",
+    "preencoder_conf",
+    "encoder",
+    "encoder_conf",
+    "postencoder",
+    "postencoder_conf",
+    "decoder",
+    "decoder_conf",
+    "ctc_conf",
+    "extra_asr_decoder",
+    "extra_asr_decoder_conf",
+    "extra_mt_decoder",
+    "extra_mt_decoder_conf",
+]
+
+
+class EnhS2TTask(AbsTask):
+    """Joint task wrapping enh/asr/st submodels into one ESPnetEnhS2TModel."""
+
+    # If you need more than one optimizers, change this value
+    num_optimizers: int = 1
+
+    # Add variable objects configurations
+    class_choices_list = [
+        # --enh_encoder and --enh_encoder_conf
+        enh_encoder_choices,
+        # --enh_separator and --enh_separator_conf
+        enh_separator_choices,
+        # --enh_decoder and --enh_decoder_conf
+        enh_decoder_choices,
+        # --frontend and --frontend_conf
+        frontend_choices,
+        # --specaug and --specaug_conf
+        specaug_choices,
+        # --normalize and --normalize_conf
+        normalize_choices,
+        # --asr_preencoder and --asr_preencoder_conf
+        asr_preencoder_choices,
+        # --asr_encoder and --asr_encoder_conf
+        asr_encoder_choices,
+        # --asr_postencoder and --asr_postencoder_conf
+        asr_postencoder_choices,
+        # --asr_decoder and --asr_decoder_conf
+        asr_decoder_choices,
+        # --st_preencoder and --st_preencoder_conf
+        st_preencoder_choices,
+        # --st_encoder and --st_encoder_conf
+        st_encoder_choices,
+        # --st_postencoder and --st_postencoder_conf
+        st_postencoder_choices,
+        # --st_decoder and --st_decoder_conf
+        st_decoder_choices,
+        # --st_extra_asr_decoder and --st_extra_asr_decoder_conf
+        st_extra_asr_decoder_choices,
+        # --st_extra_mt_decoder and --st_extra_mt_decoder_conf
+        st_extra_mt_decoder_choices,
+    ]
+
+    # If you need to modify train() or eval() procedures, change Trainer class here
+    trainer = Trainer
+
+    @classmethod
+    def add_task_arguments(cls, parser: argparse.ArgumentParser):
+        """Register task-related and preprocessing options on ``parser``."""
+        from espnet2.st.espnet_model import ESPnetSTModel  # st_model_conf defaults
+
+        group = parser.add_argument_group(description="Task related")
+        # NOTE(kamo): add_arguments(..., required=True) can't be used
+        # to provide --print_config mode. Instead of it, do as
+        required = parser.get_default("required")
+        required += ["token_list"]
+
+        group.add_argument(
+            "--token_list",
+            type=str_or_none,
+            default=None,
+            help="A text mapping int-id to token",
+        )
+        group.add_argument(
+            "--src_token_list",
+            type=str_or_none,
+            default=None,
+            help="A text mapping int-id to token (for source language)",
+        )
+        group.add_argument(
+            "--init",
+            type=lambda x: str_or_none(x.lower()),
+            default=None,
+            help="The initialization method",
+            choices=[
+                "chainer",
+                "xavier_uniform",
+                "xavier_normal",
+                "kaiming_uniform",
+                "kaiming_normal",
+                None,
+            ],
+        )
+        group.add_argument(
+            "--input_size",
+            type=int_or_none,
+            default=None,
+            help="The number of input dimension of the feature",
+        )
+        group.add_argument(
+            "--ctc_conf",
+            action=NestedDictAction,
+            default=get_default_kwargs(CTC),
+            help="The keyword arguments for CTC class.",
+        )
+        group.add_argument(
+            "--enh_criterions",
+            action=NestedDictAction,
+            default=[
+                {
+                    "name": "si_snr",
+                    "conf": {},
+                    "wrapper": "fixed_order",
+                    "wrapper_conf": {},
+                },
+            ],
+            help="The criterions binded with the loss wrappers.",
+        )
+
+        group.add_argument(
+            "--enh_model_conf",
+            action=NestedDictAction,
+            default=get_default_kwargs(ESPnetEnhancementModel),
+            help="The keyword arguments for enh submodel class.",
+        )
+
+        group.add_argument(
+            "--asr_model_conf",
+            action=NestedDictAction,
+            default=get_default_kwargs(ESPnetASRModel),
+            help="The keyword arguments for asr submodel class.",
+        )
+
+        group.add_argument(
+            "--st_model_conf",
+            action=NestedDictAction,
+            default=get_default_kwargs(ESPnetSTModel),
+            help="The keyword arguments for st submodel class.",
+        )
+
+        group.add_argument(
+            "--subtask_series",
+            type=str,
+            nargs="+",
+            default=("enh", "asr"),
+            choices=["enh", "asr", "st"],
+            help="The series of subtasks in the pipeline.",
+        )
+
+        group.add_argument(
+            "--model_conf",
+            action=NestedDictAction,
+            default=get_default_kwargs(ESPnetEnhS2TModel),
+            help="The keyword arguments for model class.",
+        )
+
+        group = parser.add_argument_group(description="Preprocess related")
+        group.add_argument(
+            "--use_preprocessor",
+            type=str2bool,
+            default=False,
+            help="Apply preprocessing to data or not",
+        )
+        group.add_argument(
+            "--token_type",
+            type=str,
+            default="bpe",
+            choices=["bpe", "char", "word", "phn"],
+            help="The text will be tokenized " "in the specified level token",
+        )
+        group.add_argument(
+            "--bpemodel",
+            type=str_or_none,
+            default=None,
+            help="The model file of sentencepiece",
+        )
+        group.add_argument(
+            "--src_token_type",
+            type=str,
+            default="bpe",
+            choices=["bpe", "char", "word", "phn"],
+            help="The source text will be tokenized " "in the specified level token",
+        )
+        group.add_argument(
+            "--src_bpemodel",
+            type=str_or_none,
+            default=None,
+            help="The model file of sentencepiece (for source language)",
+        )
+        parser.add_argument(
+            "--non_linguistic_symbols",
+            type=str_or_none,
+            help="non_linguistic_symbols file path",
+        )
+        parser.add_argument(
+            "--cleaner",
+            type=str_or_none,
+            choices=[None, "tacotron", "jaconv", "vietnamese"],
+            default=None,
+            help="Apply text cleaning",
+        )
+        parser.add_argument(
+            "--g2p",
+            type=str_or_none,
+            choices=g2p_choices,
+            default=None,
+            help="Specify g2p method if --token_type=phn",
+        )
+
+        for class_choices in cls.class_choices_list:
+            # Append --<name> and --<name>_conf.
+            # e.g. --encoder and --encoder_conf
+            class_choices.add_arguments(group)
+
+    @classmethod
+    def build_collate_fn(
+        cls, args: argparse.Namespace, train: bool
+    ) -> Callable[
+        [Collection[Tuple[str, Dict[str, np.ndarray]]]],
+        Tuple[List[str], Dict[str, torch.Tensor]],
+    ]:
+        """Return a CommonCollateFn padding floats with 0.0 and ints with -1."""
+        assert check_argument_types()
+        # NOTE(kamo): int value = 0 is reserved by CTC-blank symbol
+        return CommonCollateFn(float_pad_value=0.0, int_pad_value=-1)
+
+    @classmethod
+    def build_preprocess_fn(
+        cls, args: argparse.Namespace, train: bool
+    ) -> Optional[Callable[[str, Dict[str, np.array]], Dict[str, np.ndarray]]]:
+        """Build the text preprocessor, or None when --use_preprocessor is false."""
+        assert check_argument_types()
+        if args.use_preprocessor:
+            if "st" in args.subtask_series:
+                retval = MutliTokenizerCommonPreprocessor(
+                    train=train,
+                    token_type=[args.token_type, args.src_token_type],
+                    token_list=[args.token_list, args.src_token_list],
+                    bpemodel=[args.bpemodel, args.src_bpemodel],
+                    non_linguistic_symbols=args.non_linguistic_symbols,
+                    text_cleaner=args.cleaner,
+                    g2p_type=args.g2p,
+                    # NOTE(kamo): Check attribute existence for backward compatibility
+                    rir_scp=getattr(args, "rir_scp", None),
+                    rir_apply_prob=getattr(args, "rir_apply_prob", 1.0),
+                    noise_scp=getattr(args, "noise_scp", None),
+                    noise_apply_prob=getattr(args, "noise_apply_prob", 1.0),
+                    noise_db_range=getattr(args, "noise_db_range", "13_15"),
+                    speech_volume_normalize=getattr(
+                        args, "speech_volume_normalize", None
+                    ),
+                    speech_name="speech",
+                    text_name=["text", "src_text"],
+                )
+            else:
+                retval = CommonPreprocessor_multi(
+                    train=train,
+                    token_type=args.token_type,
+                    token_list=args.token_list,
+                    bpemodel=args.bpemodel,
+                    non_linguistic_symbols=args.non_linguistic_symbols,
+                    text_name=["text"],
+                    text_cleaner=args.cleaner,
+                    g2p_type=args.g2p,
+                )
+        else:
+            retval = None
+        assert check_return_type(retval)
+        return retval
+
+    @classmethod
+    def required_data_names(
+        cls, train: bool = True, inference: bool = False
+    ) -> Tuple[str, ...]:
+        """Names of mandatory data keys; only "speech" is needed for inference."""
+        if not inference:
+            retval = ("speech", "speech_ref1", "text")
+        else:
+            # Recognition mode
+            retval = ("speech",)
+        return retval
+
+    @classmethod
+    def optional_data_names(
+        cls, train: bool = True, inference: bool = False
+    ) -> Tuple[str, ...]:
+        """Optional data keys: dereverb/speech/noise references and src_text."""
+        retval = ["dereverb_ref1"]
+        retval += ["speech_ref{}".format(n) for n in range(2, MAX_REFERENCE_NUM + 1)]
+        retval += ["noise_ref{}".format(n) for n in range(1, MAX_REFERENCE_NUM + 1)]
+        retval += ["src_text"]
+        retval = tuple(retval)
+        assert check_return_type(retval)
+        return retval
+
+    @classmethod
+    def build_model(cls, args: argparse.Namespace) -> ESPnetEnhS2TModel:
+        """Build each subtask model in ``subtask_series`` order and wrap them."""
+        assert check_argument_types()
+
+        # Build submodels in the order of subtask_series
+        model_conf = args.model_conf.copy()
+        for subtask in args.subtask_series:
+            # Validate before any lookup so unknown subtasks raise ValueError
+            if subtask in ["asr", "st"]:
+                m_subtask = "s2t"
+            elif subtask in ["enh"]:
+                m_subtask = subtask
+            else:
+                raise ValueError(f"{subtask} not supported.")
+
+            subtask_conf = dict(
+                init=None, model_conf=getattr(args, f"{subtask}_model_conf")
+            )
+            # Prefer the "<subtask>_<attr>" option; fall back to the shared "<attr>"
+            for attr in globals()[f"{subtask}_attributes"]:
+                value = getattr(args, subtask + "_" + attr, None)
+                if value is None:
+                    value = getattr(args, attr, None)
+                subtask_conf[attr] = value
+
+            logging.info(f"Building {subtask} task model, using config: {subtask_conf}")
+
+            model_conf[f"{m_subtask}_model"] = name2task[subtask].build_model(
+                argparse.Namespace(**subtask_conf)
+            )
+
+        # 8. Build model
+        model = ESPnetEnhS2TModel(**model_conf)
+
+        # FIXME(kamo): Should be done in model?
+        # 9. Initialize
+        if args.init is not None:
+            initialize(model, args.init)
+
+        assert check_return_type(model)
+        return model
diff --git a/espnet2/train/trainer.py b/espnet2/train/trainer.py
index 766651ddbaa..304d3329264 100644
--- a/espnet2/train/trainer.py
+++ b/espnet2/train/trainer.py
@@ -502,7 +502,7 @@ def train_one_epoch(
         iterator_stop = torch.tensor(0).to("cuda" if ngpu > 0 else "cpu")
 
         start_time = time.perf_counter()
-        for iiter, (_, batch) in enumerate(
+        for iiter, (utt_id, batch) in enumerate(
             reporter.measure_iter_time(iterator, "iter_time"), 1
         ):
             assert isinstance(batch, dict), type(batch)
@@ -512,6 +512,8 @@ def train_one_epoch(
                 if iterator_stop > 0:
                     break
 
+            batch["utt_id"] = utt_id
+
             batch = to_device(batch, "cuda" if ngpu > 0 else "cpu")
             if no_forward_run:
                 all_steps_are_invalid = False
@@ -705,13 +707,15 @@ def validate_one_epoch(
         # [For distributed] Because iteration counts are not always equals between
         # processes, send stop-flag to the other processes if iterator is finished
         iterator_stop = torch.tensor(0).to("cuda" if ngpu > 0 else "cpu")
-        for (_, batch) in iterator:
+        for (utt_id, batch) in iterator:
             assert isinstance(batch, dict), type(batch)
             if distributed:
                 torch.distributed.all_reduce(iterator_stop, ReduceOp.SUM)
                 if iterator_stop > 0:
                     break
 
+            batch["utt_id"] = utt_id
+
             batch = to_device(batch, "cuda" if ngpu > 0 else "cpu")
             if no_forward_run:
                 continue
@@ -763,6 +767,9 @@ def plot_attention(
                 len(next(iter(batch.values()))),
                 len(ids),
             )
+
+            batch["utt_id"] = ids
+
             batch = to_device(batch, "cuda" if ngpu > 0 else "cpu")
             if no_forward_run:
                 continue
diff --git a/espnet2/tts/espnet_model.py b/espnet2/tts/espnet_model.py
index 986c7d029a0..e09c4a35a55 100644
--- a/espnet2/tts/espnet_model.py
+++ b/espnet2/tts/espnet_model.py
@@ -67,6 +67,7 @@ def forward(
         spembs: Optional[torch.Tensor] = None,
         sids: Optional[torch.Tensor] = None,
         lids: Optional[torch.Tensor] = None,
+        **kwargs,
     ) -> Tuple[torch.Tensor, Dict[str, torch.Tensor], torch.Tensor]:
         """Caclualte outputs and return the loss tensor.
 
@@ -84,6 +85,7 @@ def forward(
             spembs (Optional[Tensor]): Speaker embedding tensor (B, D).
             sids (Optional[Tensor]): Speaker ID tensor (B, 1).
             lids (Optional[Tensor]): Language ID tensor (B, 1).
+            kwargs: Extra keyword arguments; "utt_id" is among the inputs.
 
         Returns:
             Tensor: Loss scalar tensor.
@@ -166,6 +168,7 @@ def collect_feats(
         spembs: Optional[torch.Tensor] = None,
         sids: Optional[torch.Tensor] = None,
         lids: Optional[torch.Tensor] = None,
+        **kwargs,
     ) -> Dict[str, torch.Tensor]:
         """Caclualte features and return them as a dict.
 
diff --git a/test/espnet2/asr/frontend/test_s3prl.py b/test/espnet2/asr/frontend/test_s3prl.py
index 77564a21a91..0bfebb823b3 100644
--- a/test/espnet2/asr/frontend/test_s3prl.py
+++ b/test/espnet2/asr/frontend/test_s3prl.py
@@ -1,12 +1,22 @@
 from distutils.version import LooseVersion
-import os
 
 import torch
 
+from espnet2.asr.frontend.s3prl import S3prlFrontend
+
 is_torch_1_7_plus = LooseVersion(torch.__version__) >= LooseVersion("1.7.0")
 
-if is_torch_1_7_plus:
-    from s3prl.upstream.interfaces import Featurizer
+
+def test_frontend_init():
+    """The "mel" upstream yields an s3prl-typed frontend with a positive dim."""
+    if not is_torch_1_7_plus:
+        return
+
+    s3prl_frontend = S3prlFrontend(fs=16000, frontend_conf={"upstream": "mel"})
+
+    # The constructed frontend must report its type and a usable output size.
+    assert s3prl_frontend.frontend_type == "s3prl"
+    assert s3prl_frontend.output_dim > 0
 
 
 def test_frontend_output_size():
@@ -14,28 +24,28 @@ def test_frontend_output_size():
     if not is_torch_1_7_plus:
         return
 
-    s3prl_path = None
-    python_path_list = os.environ.get("PYTHONPATH", "(None)").split(":")
-    for p in python_path_list:
-        if p.endswith("s3prl"):
-            s3prl_path = p
-            break
-    assert s3prl_path is not None
-
-    s3prl_upstream = torch.hub.load(
-        s3prl_path,
-        "mel",
-        source="local",
-    ).to("cpu")
-
-    feature_selection = "last_hidden_state"
-    s3prl_featurizer = Featurizer(
-        upstream=s3prl_upstream,
-        feature_selection=feature_selection,
-        upstream_device="cpu",
+    frontend = S3prlFrontend(
+        fs=16000,
+        frontend_conf=dict(upstream="mel"),
+        download_dir="./hub",
     )
 
-    wavs = [torch.randn(1600)]
-    feats = s3prl_upstream(wavs)
-    feats = s3prl_featurizer(wavs, feats)
-    assert feats[0].shape[-1] == 80
+    wavs = torch.randn(2, 1600)
+    lengths = torch.LongTensor([1600, 1600])
+    feats, _ = frontend(wavs, lengths)
+    assert feats.shape[-1] == frontend.output_dim
+
+
+def test_frontend_backward():
+    """Backpropagating through the "mel" frontend output must not raise."""
+    if not is_torch_1_7_plus:
+        return
+
+    s3prl_frontend = S3prlFrontend(
+        fs=16000, frontend_conf={"upstream": "mel"}, download_dir="./hub"
+    )
+    # Two equal-length random waveforms created with requires_grad=True.
+    speech = torch.randn(2, 1600, requires_grad=True)
+    speech_lengths = torch.LongTensor([1600, 1600])
+    features, _ = s3prl_frontend(speech, speech_lengths)
+    features.sum().backward()
diff --git a/test/espnet2/bin/test_asr_inference.py b/test/espnet2/bin/test_asr_inference.py
index 9cd9b2a232f..43bc230dcf6 100644
--- a/test/espnet2/bin/test_asr_inference.py
+++ b/test/espnet2/bin/test_asr_inference.py
@@ -10,6 +10,7 @@
 from espnet2.bin.asr_inference import main
 from espnet2.bin.asr_inference import Speech2Text
 from espnet2.tasks.asr import ASRTask
+from espnet2.tasks.enh_s2t import EnhS2TTask
 from espnet2.tasks.lm import LMTask
 
 
@@ -118,3 +119,38 @@ def test_Speech2Text_streaming(asr_config_file_streaming, lm_config_file):
         assert isinstance(token[0], str)
         assert isinstance(token_int[0], int)
         assert isinstance(hyp, Hypothesis)
+
+
+@pytest.fixture()
+def enh_asr_config_file(tmp_path: Path, token_list):
+    """Write the default EnhS2TTask config via a dry run and return its path."""
+    output_dir = tmp_path / "enh_asr"
+    dry_run_args = [
+        "--dry_run",
+        "true",
+        "--output_dir",
+        str(output_dir),
+        "--token_list",
+        str(token_list),
+        "--token_type",
+        "char",
+    ]
+    EnhS2TTask.main(cmd=dry_run_args)
+    return output_dir / "config.yaml"
+
+
+@pytest.mark.execution_timeout(5)
+def test_EnhS2T_Speech2Text(enh_asr_config_file, lm_config_file):
+    """Decode random audio with enh_s2t_task=True and type-check each result."""
+    decoder_kwargs = dict(
+        asr_train_config=enh_asr_config_file,
+        lm_train_config=lm_config_file,
+        beam_size=1,
+        enh_s2t_task=True,
+    )
+    recognizer = Speech2Text(**decoder_kwargs)
+    for text, token, token_int, hyp in recognizer(np.random.randn(100000)):
+        assert isinstance(text, str)
+        assert isinstance(token[0], str)
+        assert isinstance(token_int[0], int)
+        assert isinstance(hyp, Hypothesis)
diff --git a/test/espnet2/bin/test_enh_inference.py b/test/espnet2/bin/test_enh_inference.py
index 5150e25823e..898e44a4956 100644
--- a/test/espnet2/bin/test_enh_inference.py
+++ b/test/espnet2/bin/test_enh_inference.py
@@ -1,5 +1,6 @@
 from argparse import ArgumentParser
 from pathlib import Path
+import string
 
 import pytest
 import torch
@@ -8,6 +9,7 @@
 from espnet2.bin.enh_inference import main
 from espnet2.bin.enh_inference import SeparateSpeech
 from espnet2.tasks.enh import EnhancementTask
+from espnet2.tasks.enh_s2t import EnhS2TTask
 
 
 def test_get_parser():
@@ -27,10 +29,10 @@ def config_file(tmp_path: Path):
             "--dry_run",
             "true",
             "--output_dir",
-            str(tmp_path),
+            str(tmp_path / "enh"),
         ]
     )
-    return tmp_path / "config.yaml"
+    return tmp_path / "enh" / "config.yaml"
 
 
 @pytest.mark.execution_timeout(5)
@@ -50,3 +52,57 @@ def test_SeparateSpeech(
     )
     wav = torch.rand(batch_size, input_size)
     separate_speech(wav, fs=8000)
+
+
+@pytest.fixture()
+def token_list(tmp_path: Path):
+    """Write a character vocabulary file (one token per line); return its path."""
+    token_file = tmp_path / "tokens.txt"
+    # Order: blank, every ASCII letter, then the unknown and sos/eos symbols.
+    vocabulary = ["<blank>", *string.ascii_letters, "<unk>", "<sos/eos>"]
+    with token_file.open("w") as f:
+        f.writelines(f"{token}\n" for token in vocabulary)
+    return token_file
+
+
+@pytest.fixture()
+def enh_s2t_config_file(tmp_path: Path, token_list):
+    """Write the default EnhS2TTask config via a dry run and return its path."""
+    output_dir = tmp_path / "enh_s2t"
+    dry_run_args = [
+        "--dry_run",
+        "true",
+        "--output_dir",
+        str(output_dir),
+        "--token_list",
+        str(token_list),
+        "--token_type",
+        "char",
+    ]
+    EnhS2TTask.main(cmd=dry_run_args)
+    return output_dir / "config.yaml"
+
+
+@pytest.mark.execution_timeout(5)
+@pytest.mark.parametrize("batch_size", [1, 2])
+@pytest.mark.parametrize(
+    "input_size, segment_size, hop_size, normalize_segment_scale",
+    [(16000, None, None, False), (35000, 2.4, 0.8, False), (35000, 2.4, 0.8, True)],
+)
+def test_enh_s2t_SeparateSpeech(
+    enh_s2t_config_file,
+    batch_size,
+    input_size,
+    segment_size,
+    hop_size,
+    normalize_segment_scale,
+):
+    """Run SeparateSpeech with enh_s2t_task=True on a random batch."""
+    separator_kwargs = dict(
+        train_config=enh_s2t_config_file,
+        segment_size=segment_size,
+        hop_size=hop_size,
+        normalize_segment_scale=normalize_segment_scale,
+        enh_s2t_task=True,
+    )
+    SeparateSpeech(**separator_kwargs)(torch.rand(batch_size, input_size), fs=8000)
diff --git a/test/espnet2/bin/test_enh_s2t_train.py b/test/espnet2/bin/test_enh_s2t_train.py
new file mode 100644
index 00000000000..2cd4fe6f94f
--- /dev/null
+++ b/test/espnet2/bin/test_enh_s2t_train.py
@@ -0,0 +1,15 @@
+from argparse import ArgumentParser
+
+import pytest
+
+from espnet2.bin.enh_s2t_train import get_parser
+from espnet2.bin.enh_s2t_train import main
+
+
+def test_get_parser():
+    assert isinstance(get_parser(), ArgumentParser)  # returns an argparse parser
+
+
+def test_main():
+    with pytest.raises(SystemExit):  # main() must terminate via SystemExit here
+        main()
diff --git a/test/espnet2/enh/test_espnet_enh_s2t_model.py b/test/espnet2/enh/test_espnet_enh_s2t_model.py
new file mode 100644
index 00000000000..5f7df398130
--- /dev/null
+++ b/test/espnet2/enh/test_espnet_enh_s2t_model.py
@@ -0,0 +1,129 @@
+import pytest
+import torch
+
+from espnet2.asr.ctc import CTC
+from espnet2.asr.decoder.transformer_decoder import TransformerDecoder
+from espnet2.asr.encoder.transformer_encoder import TransformerEncoder
+from espnet2.asr.espnet_model import ESPnetASRModel
+from espnet2.asr.frontend.default import DefaultFrontend
+from espnet2.enh.decoder.stft_decoder import STFTDecoder
+from espnet2.enh.encoder.stft_encoder import STFTEncoder
+from espnet2.enh.espnet_enh_s2t_model import ESPnetEnhS2TModel
+from espnet2.enh.espnet_model import ESPnetEnhancementModel
+from espnet2.enh.loss.criterions.time_domain import SISNRLoss
+from espnet2.enh.loss.wrappers.fixed_order import FixedOrderSolver
+from espnet2.enh.separator.rnn_separator import RNNSeparator
+
+
+# Tiny module-level components shared by the parametrized test below.
+# They are built once, when this module is imported.
+
+# Enhancement encoder/decoder: an STFT pair configured identically.
+enh_stft_encoder = STFTEncoder(n_fft=32, hop_length=16)
+enh_stft_decoder = STFTDecoder(n_fft=32, hop_length=16)
+
+# NOTE(review): input_dim=17 presumably equals n_fft // 2 + 1 -- confirm.
+enh_rnn_separator = RNNSeparator(
+    input_dim=17,
+    layer=1,
+    unit=10,
+    num_spk=1,
+)
+
+# Enhancement loss: the SI-SNR criterion handed to a fixed-order solver.
+si_snr_loss = SISNRLoss()
+fix_order_solver = FixedOrderSolver(criterion=si_snr_loss)
+
+# Frontend used by the ASR submodel.
+default_frontend = DefaultFrontend(
+    fs=300,
+    n_fft=32,
+    win_length=32,
+    hop_length=24,
+    n_mels=32,
+)
+
+# Vocabulary; its length sizes the decoder and the CTC output below.
+token_list = ["<blank>", "<space>", "a", "e", "i", "o", "u", "<sos/eos>"]
+
+# NOTE(review): 32 presumably mirrors the frontend's n_mels -- confirm.
+asr_transformer_encoder = TransformerEncoder(
+    32,
+    output_size=16,
+    linear_units=16,
+    num_blocks=2,
+)
+
+# NOTE(review): 16 presumably mirrors the encoder's output_size -- confirm.
+asr_transformer_decoder = TransformerDecoder(
+    len(token_list), 16, linear_units=16, num_blocks=2
+)
+
+# CTC module sized by the vocabulary length, with encoder_output_size=16.
+asr_ctc = CTC(odim=len(token_list), encoder_output_size=16)
+
+
+@pytest.mark.parametrize(
+    "enh_encoder, enh_decoder",
+    [(enh_stft_encoder, enh_stft_decoder)],
+)
+@pytest.mark.parametrize("enh_separator", [enh_rnn_separator])
+@pytest.mark.parametrize("training", [True, False])
+@pytest.mark.parametrize("loss_wrappers", [[fix_order_solver]])
+@pytest.mark.parametrize("frontend", [default_frontend])
+@pytest.mark.parametrize("s2t_encoder", [asr_transformer_encoder])
+@pytest.mark.parametrize("s2t_decoder", [asr_transformer_decoder])
+@pytest.mark.parametrize("s2t_ctc", [asr_ctc])
+def test_enh_asr_model(
+    enh_encoder,
+    enh_decoder,
+    enh_separator,
+    training,
+    loss_wrappers,
+    frontend,
+    s2t_encoder,
+    s2t_decoder,
+    s2t_ctc,
+):
+    """Run one forward pass of the joint enhancement + ASR model."""
+    # Two-utterance random batch, keyed by the keyword names given to forward.
+    batch = {
+        "speech": torch.randn(2, 300),
+        "speech_lengths": torch.LongTensor([300, 200]),
+        "speech_ref1": torch.randn(2, 300).float(),
+        "text": torch.LongTensor([[1, 2, 3, 4, 5], [5, 4, 3, 2, 1]]),
+        "text_lengths": torch.LongTensor([5, 5]),
+    }
+
+    # Enhancement submodel assembled from the parametrized components.
+    enh_model = ESPnetEnhancementModel(
+        encoder=enh_encoder,
+        separator=enh_separator,
+        decoder=enh_decoder,
+        loss_wrappers=loss_wrappers,
+    )
+
+    # ASR submodel; every optional component is passed as None.
+    s2t_model = ESPnetASRModel(
+        vocab_size=len(token_list),
+        token_list=token_list,
+        frontend=frontend,
+        encoder=s2t_encoder,
+        decoder=s2t_decoder,
+        ctc=s2t_ctc,
+        specaug=None,
+        normalize=None,
+        preencoder=None,
+        postencoder=None,
+        joint_network=None,
+    )
+    # Joint wrapper under test.
+    enh_s2t_model = ESPnetEnhS2TModel(enh_model=enh_model, s2t_model=s2t_model)
+
+    if not training:
+        enh_s2t_model.eval()
+    else:
+        enh_s2t_model.train()
+
+    # Only checks that the call runs and unpacks into three values.
+    loss, stats, weight = enh_s2t_model(**batch)
diff --git a/test/espnet2/tasks/test_abs_task.py b/test/espnet2/tasks/test_abs_task.py
index b03e35b29f3..7a9297f78e2 100644
--- a/test/espnet2/tasks/test_abs_task.py
+++ b/test/espnet2/tasks/test_abs_task.py
@@ -17,7 +17,7 @@ def __init__(self):
     def collect_feats(self):
         return {}
 
-    def forward(self, x, x_lengths):
+    def forward(self, x, x_lengths, **kwargs):
         x = self.layer1(x)
         x = self.layer2(x)
         retval = {
diff --git a/test/espnet2/tasks/test_enh_s2t.py b/test/espnet2/tasks/test_enh_s2t.py
new file mode 100644
index 00000000000..1d4622d21d5
--- /dev/null
+++ b/test/espnet2/tasks/test_enh_s2t.py
@@ -0,0 +1,36 @@
+import pytest
+
+from espnet2.tasks.enh_s2t import EnhS2TTask
+
+
+def test_add_arguments():
+    EnhS2TTask.get_parser()  # building the parser must not raise
+
+
+def test_add_arguments_help():
+    parser = EnhS2TTask.get_parser()
+    with pytest.raises(SystemExit):  # parsing "--help" must exit
+        parser.parse_args(["--help"])
+
+
+def test_main_help():
+    with pytest.raises(SystemExit):  # main() with "--help" must exit
+        EnhS2TTask.main(cmd=["--help"])
+
+
+def test_main_print_config():
+    with pytest.raises(SystemExit):  # main() with "--print_config" must exit
+        EnhS2TTask.main(cmd=["--print_config"])
+
+
+def test_main_with_no_args():
+    with pytest.raises(SystemExit):  # main() with an empty command must exit
+        EnhS2TTask.main(cmd=[])
+
+
+def test_print_config_and_load_it(tmp_path):
+    """A config written by print_config must be accepted back via --config."""
+    dumped = tmp_path / "config.yaml"
+    with dumped.open("w") as stream:
+        EnhS2TTask.print_config(stream)
+    EnhS2TTask.get_parser().parse_args(["--config", str(dumped)])