diff --git a/README.md b/README.md
index b02a8b9212d..7bae1d28434 100644
--- a/README.md
+++ b/README.md
@@ -35,7 +35,7 @@ ESPnet uses [pytorch](http://pytorch.org/) as a deep learning engine and also fo
 - Support numbers of `ASR` recipes (WSJ, Switchboard, CHiME-4/5, Librispeech, TED, CSJ, AMI, HKUST, Voxforge, REVERB, etc.)
 - Support numbers of `TTS` recipes with a similar manner to the ASR recipe (LJSpeech, LibriTTS, M-AILABS, etc.)
 - Support numbers of `ST` recipes (Fisher-CallHome Spanish, Libri-trans, IWSLT'18, How2, Must-C, Mboshi-French, etc.)
-- Support numbers of `MT` recipes (IWSLT'16, the above ST recipes etc.)
+- Support numbers of `MT` recipes (IWSLT'14, IWSLT'16, the above ST recipes etc.)
 - Support numbers of `SLU` recipes (CATSLU-MAPS, FSC, Grabo, IEMOCAP, JDCINAL, SNIPS, SLURP, SWBD-DA, etc.)
 - Support numbers of `SE/SS` recipes (DNS-IS2020, LibriMix, SMS-WSJ, VCTK-noisyreverb, WHAM!, WHAMR!, WSJ-2mix, etc.)
 - Support voice conversion recipe (VCC2020 baseline)
@@ -368,6 +368,7 @@ Available pretrained models in the demo script are listed as below.
 | Must-C tst-COMMON (En->De) | 27.63 | [link](https://github.com/espnet/espnet/blob/master/egs/must_c/mt1/RESULTS.md#summary-4-gram-bleu) |
 | IWSLT'14 test2014 (En->De) | 24.70 | [link](https://github.com/espnet/espnet/blob/master/egs/iwslt16/mt1/RESULTS.md#result) |
 | IWSLT'14 test2014 (De->En) | 29.22 | [link](https://github.com/espnet/espnet/blob/master/egs/iwslt16/mt1/RESULTS.md#result) |
+| IWSLT'14 test2014 (De->En) | 32.2 | [link](https://github.com/espnet/espnet/blob/master/egs2/iwslt14/mt1/README.md) |
 | IWSLT'16 test2014 (En->De) | 24.05 | [link](https://github.com/espnet/espnet/blob/master/egs/iwslt16/mt1/RESULTS.md#result) |
 | IWSLT'16 test2014 (De->En) | 29.13 | [link](https://github.com/espnet/espnet/blob/master/egs/iwslt16/mt1/RESULTS.md#result) |
 
diff --git a/egs2/README.md b/egs2/README.md
index d03884f2ceb..dcbd80bf5b9 100755
--- a/egs2/README.md
+++ b/egs2/README.md
@@ -75,6 +75,7 @@ See: https://espnet.github.io/espnet/espnet2_tutorial.html#recipes-using-espnet2
 | su_openslr36 | Sundanese | ASR | SUN | http://www.openslr.org/36 | |
 | swbd | Switchboard Corpus for 2-channel Conversational Telephone Speech (300h) | ASR | ENG | https://catalog.ldc.upenn.edu/LDC97S62 | |
 | swbd_da | NXT Switchboard Annotations | SLU | ENG | https://catalog.ldc.upenn.edu/LDC2009T26 | |
+| swbd_sentiment | Speech Sentiment Annotations | SLU | ENG | https://catalog.ldc.upenn.edu/LDC2020T14 | |
 | tedlium2 | TED-LIUM corpus release 2 | ASR | ENG | https://www.openslr.org/19/, http://www.lrec-conf.org/proceedings/lrec2014/pdf/1104_Paper.pdf | |
 | thchs30 | A Free Chinese Speech Corpus Released by CSLT@Tsinghua University | TTS | CMN | https://www.openslr.org/18/ | |
 | timit | TIMIT Acoustic-Phonetic Continuous Speech Corpus | ASR | ENG | https://catalog.ldc.upenn.edu/LDC93S1 | |
diff --git a/egs2/TEMPLATE/mt1/mt.sh b/egs2/TEMPLATE/mt1/mt.sh
index cbfaf059d41..6164c155558 100755
--- a/egs2/TEMPLATE/mt1/mt.sh
+++ b/egs2/TEMPLATE/mt1/mt.sh
@@ -460,7 +460,7 @@ if ! "${skip_data_prep}"; then
 
     if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
         if [ "${feats_type}" = raw ]; then
-            log "Stage 3: data/ -> ${data_feats}"
+            log "Stage 2: data/ -> ${data_feats}"
 
             for dset in "${train_set}" "${valid_set}" ${test_sets}; do
                 if [ "${dset}" = "${train_set}" ] || [ "${dset}" = "${valid_set}" ]; then
@@ -508,19 +508,18 @@ if ! "${skip_data_prep}"; then
"${skip_data_prep}"; then if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then - # Then generate src lang if "${token_joint}"; then log "Merge src and target data if joint BPE" cat $tgt_bpe_train_text > ${data_feats}/${train_set}/text.${src_lang}_${tgt_lang} - [ -z "${src_bpe_train_text}" ] && cat ${src_bpe_train_text} >> ${data_feats}/${train_set}/text.${src_lang}_${tgt_lang} + [ ! -z "${src_bpe_train_text}" ] && cat ${src_bpe_train_text} >> ${data_feats}/${train_set}/text.${src_lang}_${tgt_lang} # Set the new text as the target text tgt_bpe_train_text="${data_feats}/${train_set}/text.${src_lang}_${tgt_lang}" fi # First generate tgt lang if [ "${tgt_token_type}" = bpe ]; then - log "Stage 5a: Generate token_list from ${tgt_bpe_train_text} using BPE for tgt_lang" + log "Stage 4a: Generate token_list from ${tgt_bpe_train_text} using BPE for tgt_lang" mkdir -p "${tgt_bpedir}" # shellcheck disable=SC2002 @@ -550,7 +549,7 @@ if ! "${skip_data_prep}"; then } > "${tgt_token_list}" elif [ "${tgt_token_type}" = char ] || [ "${tgt_token_type}" = word ]; then - log "Stage 5a: Generate character level token_list from ${tgt_bpe_train_text} for tgt_lang" + log "Stage 4a: Generate character level token_list from ${tgt_bpe_train_text} for tgt_lang" _opts="--non_linguistic_symbols ${nlsyms_txt}" @@ -593,10 +592,10 @@ if ! "${skip_data_prep}"; then # Then generate src lang if "${token_joint}"; then - log "Stage 5b: Skip separate token construction for src_lang when setting ${token_joint} as true" + log "Stage 4b: Skip separate token construction for src_lang when setting ${token_joint} as true" else if [ "${src_token_type}" = bpe ]; then - log "Stage 5b: Generate token_list from ${src_bpe_train_text} using BPE for src_lang" + log "Stage 4b: Generate token_list from ${src_bpe_train_text} using BPE for src_lang" mkdir -p "${src_bpedir}" # shellcheck disable=SC2002 @@ -626,7 +625,7 @@ if ! "${skip_data_prep}"; then } > "${src_token_list}" elif [ "${src_token_type}" = char ] || [ "${src_token_type}" = word ]; then - log "Stage 5b: Generate character level token_list from ${src_bpe_train_text} for src_lang" + log "Stage 4b: Generate character level token_list from ${src_bpe_train_text} for src_lang" _opts="--non_linguistic_symbols ${nlsyms_txt}" @@ -650,8 +649,6 @@ if ! 
"${skip_data_prep}"; then log "Error: not supported --token_type '${src_token_type}'" exit 2 fi - - fi fi diff --git a/egs2/iwslt14/mt1/README.md b/egs2/iwslt14/mt1/README.md new file mode 100644 index 00000000000..de18c222268 --- /dev/null +++ b/egs2/iwslt14/mt1/README.md @@ -0,0 +1,14 @@ +# Results + +## mt_train_mt_transformer_lr3e-3_warmup10k_share_enc_dec_input_dropout0.3_raw_bpe_tc10000 +- mt_config: conf/tuning/train_mt_transformer_lr3e-3_warmup10k_share_enc_dec_input_dropout0.3.yaml +- inference_config: conf/decode_mt.yaml + +### BLEU + +Metric: BLEU-4, detokenized case-sensitive BLEU result (single-reference) + +|dataset|bleu_score|verbose_score| +|---|---|---| +|beam5_maxlenratio1.6_penalty0.2/valid|33.3|68.4/42.9/28.9/19.8 (BP = 0.924 ratio = 0.927 hyp_len = 134328 ref_len = 144976)| +|beam5_maxlenratio1.6_penalty0.2/test|32.2|67.2/41.4/27.4/18.5 (BP = 0.933 ratio = 0.935 hyp_len = 119813 ref_len = 128122)| diff --git a/egs2/iwslt14/mt1/conf/decode_mt.yaml b/egs2/iwslt14/mt1/conf/decode_mt.yaml index 2967ee6fc0f..6570a89920d 100644 --- a/egs2/iwslt14/mt1/conf/decode_mt.yaml +++ b/egs2/iwslt14/mt1/conf/decode_mt.yaml @@ -1,5 +1,5 @@ -batch_size: 1 -beam_size: 10 -nbest: 1 +beam_size: 5 lm_weight: 0.0 - +maxlenratio: 1.6 +minlenratio: 0.0 +penalty: 0.2 diff --git a/egs2/iwslt14/mt1/conf/train_mt_transformer.yaml b/egs2/iwslt14/mt1/conf/train_mt_transformer.yaml deleted file mode 100644 index f39b863dc70..00000000000 --- a/egs2/iwslt14/mt1/conf/train_mt_transformer.yaml +++ /dev/null @@ -1,44 +0,0 @@ -batch_type: numel -batch_bins: 15000000 -accum_grad: 8 -max_epoch: 35 -patience: none -init: none -best_model_criterion: -- - valid - - acc - - max -keep_nbest_models: 10 - -encoder: transformer -encoder_conf: - output_size: 256 - attention_heads: 4 - linear_units: 2048 - num_blocks: 12 - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - attention_dropout_rate: 0.1 - input_layer: null - normalize_before: true - -decoder: transformer -decoder_conf: - attention_heads: 4 - linear_units: 2048 - num_blocks: 6 - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - self_attention_dropout_rate: 0.1 - src_attention_dropout_rate: 0.1 - -model_conf: - lsm_weight: 0.1 - length_normalized_loss: false - -optim: adam -optim_conf: - lr: 0.0005 -scheduler: warmuplr -scheduler_conf: - warmup_steps: 25000 diff --git a/egs2/iwslt14/mt1/conf/train_mt_transformer.yaml b/egs2/iwslt14/mt1/conf/train_mt_transformer.yaml new file mode 120000 index 00000000000..050cda0e4d0 --- /dev/null +++ b/egs2/iwslt14/mt1/conf/train_mt_transformer.yaml @@ -0,0 +1 @@ +tuning/train_mt_transformer_lr3e-3_warmup10k_share_enc_dec_input_dropout0.3.yaml \ No newline at end of file diff --git a/egs2/iwslt14/mt1/conf/tuning/train_mt_transformer_lr3e-3_warmup10k_share_enc_dec_input_dropout0.3.yaml b/egs2/iwslt14/mt1/conf/tuning/train_mt_transformer_lr3e-3_warmup10k_share_enc_dec_input_dropout0.3.yaml new file mode 100644 index 00000000000..8b2d8844238 --- /dev/null +++ b/egs2/iwslt14/mt1/conf/tuning/train_mt_transformer_lr3e-3_warmup10k_share_enc_dec_input_dropout0.3.yaml @@ -0,0 +1,59 @@ +frontend: embed # embedding + positional encoding +frontend_conf: + embed_dim: 512 + positional_dropout_rate: 0.3 + +encoder: transformer +encoder_conf: + output_size: 512 + attention_heads: 4 + linear_units: 1024 + num_blocks: 6 + dropout_rate: 0.3 + positional_dropout_rate: 0.3 + attention_dropout_rate: 0.3 + input_layer: null + normalize_before: true + +decoder: transformer +decoder_conf: + attention_heads: 4 + linear_units: 1024 + num_blocks: 6 
+    dropout_rate: 0.3
+    positional_dropout_rate: 0.3
+    self_attention_dropout_rate: 0.3
+    src_attention_dropout_rate: 0.3
+
+model_conf:
+    lsm_weight: 0.1
+    length_normalized_loss: false
+    share_decoder_input_output_embed: false
+    share_encoder_decoder_input_embed: true
+
+num_att_plot: 1
+log_interval: 100
+num_workers: 2
+batch_type: numel
+batch_bins: 400000000
+accum_grad: 1
+max_epoch: 200
+patience: none
+init: none
+best_model_criterion:
+-   - valid
+    - acc
+    - max
+keep_nbest_models: 10
+
+optim: adam
+optim_conf:
+    lr: 0.003
+    betas:
+        - 0.9
+        - 0.98
+    eps: 0.000000001
+    weight_decay: 0.0001
+scheduler: warmuplr
+scheduler_conf:
+    warmup_steps: 10000
diff --git a/egs2/iwslt14/mt1/run.sh b/egs2/iwslt14/mt1/run.sh
index 06c09e91751..b8567d2709a 100755
--- a/egs2/iwslt14/mt1/run.sh
+++ b/egs2/iwslt14/mt1/run.sh
@@ -10,13 +10,13 @@ tgt_lang=en
 
 train_set=train
 train_dev=valid
-test_set="test"
+test_set="test valid"
 
 mt_config=conf/train_mt_transformer.yaml
 inference_config=conf/decode_mt.yaml
 
 src_nbpe=1000
-tgt_nbpe=1000
+tgt_nbpe=10000 # if token_joint is True, then only tgt_nbpe is used
 
 # tc: truecase
 # lc: lowercase
@@ -27,12 +27,11 @@ tgt_case=tc
 
 ./mt.sh \
     --ignore_init_mismatch true \
-    --stage 1 \
-    --stop_stage 13 \
     --use_lm false \
-    --token_joint false \
-    --nj 20 \
-    --inference_nj 20 \
+    --token_joint true \
+    --ngpu 1 \
+    --nj 16 \
+    --inference_nj 32 \
     --src_lang ${src_lang} \
    --tgt_lang ${tgt_lang} \
     --src_token_type "bpe" \
@@ -49,4 +48,4 @@ tgt_case=tc
     --test_sets "${test_set}" \
     --src_bpe_train_text "data/${train_set}/text.${src_case}.${src_lang}" \
     --tgt_bpe_train_text "data/${train_set}/text.${tgt_case}.${tgt_lang}" \
-    --lm_train_text "data/${train_set}/text.${tgt_case}.${tgt_lang}" "$@"
+    --lm_train_text "data/${train_set}/text.${tgt_case}.${tgt_lang}" "$@"
diff --git a/egs2/swbd_sentiment/asr1/README.md b/egs2/swbd_sentiment/asr1/README.md
new file mode 100644
index 00000000000..84ee7efbbf1
--- /dev/null
+++ b/egs2/swbd_sentiment/asr1/README.md
@@ -0,0 +1,35 @@
+# RESULTS
+## Dataset
+- Speech Sentiment Annotations (Switchboard Sentiment)
+  - Data: https://catalog.ldc.upenn.edu/LDC2020T14
+  - Paper: https://catalog.ldc.upenn.edu/docs/LDC2020T14/LREC_2020_Switchboard_Senti.pdf
+
+## Environments
+- date: `Thu Mar 3 21:34:18 EST 2022`
+- python version: `3.7.11 (default, Jul 27 2021, 14:32:16) [GCC 7.5.0]`
+- espnet version: `espnet 0.10.7a1`
+- pytorch version: `pytorch 1.9.0+cu102`
+- Git hash: `3b53aedc654fd30a828689c2139a1e130adac077`
+  - Commit date: `Fri Feb 25 00:13:16 2022 -0500`
+
+## Using Conformer based encoder and Transformer based decoder with spectral augmentation and predicting transcript along with sentiment
+- ASR config: [conf/tuning/train_asr_conformer.yaml](conf/tuning/train_asr_conformer.yaml)
+- token_type: word
+- labels: Positive, Neutral, Negative
+- Pre-trained Model: https://huggingface.co/espnet/YushiUeda_swbd_sentiment_asr_train_asr_conformer
+
+|dataset|Snt|Intent Classification Macro F1 (%)| Weighted F1 (%)| Micro F1 (%)|
+|---|---|---|---|---|
+|decode_asr_asr_model_valid.acc.ave_10best/valid|2415|61.0|65.0|65.6|
+|decode_asr_asr_model_valid.acc.ave_10best/test|2438|61.4|64.4|64.6|
+
+## Using Conformer based encoder, Transformer based decoder and self-supervised learning features (Wav2vec2.0) with spectral augmentation and predicting transcript along with sentiment
+- ASR config: [conf/tuning/train_asr_conformer_wav2vec2.yaml](conf/tuning/train_asr_conformer_wav2vec2.yaml)
+- token_type: word
+- labels: Positive, Neutral, Negative
+- Pre-trained Model: https://huggingface.co/espnet/YushiUeda_swbd_sentiment_asr_train_asr_conformer_wav2vec2
+
+|dataset|Snt|Intent Classification Macro F1 (%)| Weighted F1 (%)| Micro F1 (%)|
+|---|---|---|---|---|
+|decode_asr_asr_model_valid.acc.ave_10best/valid|2415|64.5|67.5|67.4|
+|decode_asr_asr_model_valid.acc.ave_10best/test|2438|64.1|66.5|66.3|
\ No newline at end of file
diff --git a/egs2/swbd_sentiment/asr1/asr.sh b/egs2/swbd_sentiment/asr1/asr.sh
new file mode 120000
index 00000000000..60b05122cfd
--- /dev/null
+++ b/egs2/swbd_sentiment/asr1/asr.sh
@@ -0,0 +1 @@
+../../TEMPLATE/asr1/asr.sh
\ No newline at end of file
diff --git a/egs2/swbd_sentiment/asr1/cmd.sh b/egs2/swbd_sentiment/asr1/cmd.sh
new file mode 100644
index 00000000000..2aae6919fef
--- /dev/null
+++ b/egs2/swbd_sentiment/asr1/cmd.sh
@@ -0,0 +1,110 @@
+# ====== About run.pl, queue.pl, slurm.pl, and ssh.pl ======
+# Usage: <cmd>.pl [options] JOB=1:<nj> <log> <command...>
+# e.g.
+# run.pl --mem 4G JOB=1:10 echo.JOB.log echo JOB
+#
+# Options:
+# --time