diff --git a/ci/install.sh b/ci/install.sh
index eeb531d7ddd..5bfed7584ad 100755
--- a/ci/install.sh
+++ b/ci/install.sh
@@ -21,7 +21,7 @@ ${CXX:-g++} -v
     . ./activate_python.sh
     make TH_VERSION="${TH_VERSION}"
 
-    make warp-ctc.done warp-transducer.done chainer_ctc.done nkf.done moses.done mwerSegmenter.done pesq pyopenjtalk.done py3mmseg.done s3prl.done transformers.done phonemizer.done fairseq.done k2.done gtn.done
+    make warp-ctc.done warp-transducer.done chainer_ctc.done nkf.done moses.done mwerSegmenter.done pesq pyopenjtalk.done py3mmseg.done s3prl.done transformers.done phonemizer.done fairseq.done k2.done gtn.done longformer.done
     rm -rf kaldi
 )
 . tools/activate_python.sh
diff --git a/egs2/TEMPLATE/asr1/pyscripts/utils/score_summarization.py b/egs2/TEMPLATE/asr1/pyscripts/utils/score_summarization.py
new file mode 100644
index 00000000000..35202f1ce88
--- /dev/null
+++ b/egs2/TEMPLATE/asr1/pyscripts/utils/score_summarization.py
@@ -0,0 +1,50 @@
+import sys
+import os
+from datasets import load_metric
+import numpy as np
+from nlgeval import compute_metrics
+from nlgeval import NLGEval
+
+
+ref_file = sys.argv[1]
+hyp_file = sys.argv[2]
+
+with open(ref_file, "r") as f:
+    ref_dict = {
+        line.strip().split(" ")[0]: " ".join(line.strip().split(" ")[1:])
+        for line in f.readlines()
+    }
+
+with open(hyp_file, "r") as f:
+    hyp_dict = {
+        line.strip().split(" ")[0]: " ".join(line.strip().split(" ")[1:])
+        for line in f.readlines()
+    }
+
+keys = [k for k, v in hyp_dict.items()]
+labels = [ref_dict[k] for k, _ in hyp_dict.items()]
+decoded_preds = [v for k, v in hyp_dict.items()]
+
+metric = load_metric("bertscore")
+result_bert = metric.compute(
+    predictions=decoded_preds,
+    references=labels,
+    lang="en",
+)
+
+
+nlg = NLGEval()  # loads the models
+print("Key", "\t", "METEOR", "\t", "ROUGE-L")
+for (key, ref, hyp) in zip(keys, labels, decoded_preds):
+    metrics_dict = nlg.compute_individual_metrics([ref], hyp)
+    print(key, "\t", metrics_dict["METEOR"], "\t", metrics_dict["ROUGE_L"])
+refs = [[x] for x in labels]
+metrics_dict = nlg.compute_metrics(ref_list=[labels], hyp_list=decoded_preds)
+metric = load_metric("rouge")
+result = metric.compute(predictions=decoded_preds, references=labels)
+result = {key: value.mid.fmeasure * 100 for key, value in result.items()}
+
+print(
+    f"RESULT {result['rouge1']} {result['rouge2']} {result['rougeL']} \
+    {metrics_dict['METEOR']*100.0} {100*np.mean(result_bert['precision'])}"
+)
diff --git a/egs2/TEMPLATE/asr1/scripts/utils/show_asr_result.sh b/egs2/TEMPLATE/asr1/scripts/utils/show_asr_result.sh
index afa768bf5d5..9b8abb9d658 100755
--- a/egs2/TEMPLATE/asr1/scripts/utils/show_asr_result.sh
+++ b/egs2/TEMPLATE/asr1/scripts/utils/show_asr_result.sh
@@ -44,7 +44,16 @@ cat << EOF
 EOF
 
 while IFS= read -r expdir; do
-    if ls "${expdir}"/*/*/score_*/result.txt &> /dev/null; then
+    # Summarization scores: each "RESULT" line of result.sum becomes one row; tr -s maps every run of spaces to a single "|".
+      if ls "${expdir}"/*/*/result.sum &> /dev/null; then
+	echo "## $(basename "${expdir}")"
+	cat << EOF
+|dataset|ROUGE-1|ROUGE-2|ROUGE-L|METEOR|BERTScore|
+|---|---|---|---|---|---|
+EOF
+	grep -H -e "RESULT" "${expdir}"/*/*/result.sum | sed 's=RESULT==g' | tr -s ' ' '|'
+	echo
+      elif ls "${expdir}"/*/*/score_*/result.txt &> /dev/null; then
         echo "## $(basename ${expdir})"
         for type in wer cer ter; do
                 	cat << EOF
diff --git a/egs2/how2/asr1/cmd.sh b/egs2/how2/asr1/cmd.sh
deleted file mode 100644
index 2aae6919fef..00000000000
--- a/egs2/how2/asr1/cmd.sh
+++ /dev/null
@@ -1,110 +0,0 @@
-# ====== About run.pl, queue.pl, slurm.pl, and ssh.pl ======
-# Usage: <cmd>.pl [options] JOB=1:<nj> <log> <command...>
-# e.g.
-#   run.pl --mem 4G JOB=1:10 echo.JOB.log echo JOB
-#
-# Options:
-#   --time <time>: Limit the maximum time to execute.
-#   --mem <mem>: Limit the maximum memory usage.
-#   -–max-jobs-run <njob>: Limit the number parallel jobs. This is ignored for non-array jobs.
-#   --num-threads <ngpu>: Specify the number of CPU core.
-#   --gpu <ngpu>: Specify the number of GPU devices.
-#   --config: Change the configuration file from default.
-#
-# "JOB=1:10" is used for "array jobs" and it can control the number of parallel jobs.
-# The left string of "=", i.e. "JOB", is replaced by <N>(Nth job) in the command and the log file name,
-# e.g. "echo JOB" is changed to "echo 3" for the 3rd job and "echo 8" for 8th job respectively.
-# Note that the number must start with a positive number, so you can't use "JOB=0:10" for example.
-#
-# run.pl, queue.pl, slurm.pl, and ssh.pl have unified interface, not depending on its backend.
-# These options are mapping to specific options for each backend and
-# it is configured by "conf/queue.conf" and "conf/slurm.conf" by default.
-# If jobs failed, your configuration might be wrong for your environment.
-#
-#
-# The official documentation for run.pl, queue.pl, slurm.pl, and ssh.pl:
-#   "Parallelization in Kaldi": http://kaldi-asr.org/doc/queue.html
-# =========================================================~
-
-
-# Select the backend used by run.sh from "local", "stdout", "sge", "slurm", or "ssh"
-cmd_backend='local'
-
-# Local machine, without any Job scheduling system
-if [ "${cmd_backend}" = local ]; then
-
-    # The other usage
-    export train_cmd="run.pl"
-    # Used for "*_train.py": "--gpu" is appended optionally by run.sh
-    export cuda_cmd="run.pl"
-    # Used for "*_recog.py"
-    export decode_cmd="run.pl"
-
-# Local machine logging to stdout and log file, without any Job scheduling system
-elif [ "${cmd_backend}" = stdout ]; then
-
-    # The other usage
-    export train_cmd="stdout.pl"
-    # Used for "*_train.py": "--gpu" is appended optionally by run.sh
-    export cuda_cmd="stdout.pl"
-    # Used for "*_recog.py"
-    export decode_cmd="stdout.pl"
-
-
-# "qsub" (Sun Grid Engine, or derivation of it)
-elif [ "${cmd_backend}" = sge ]; then
-    # The default setting is written in conf/queue.conf.
-    # You must change "-q g.q" for the "queue" for your environment.
-    # To know the "queue" names, type "qhost -q"
-    # Note that to use "--gpu *", you have to setup "complex_value" for the system scheduler.
-
-    export train_cmd="queue.pl"
-    export cuda_cmd="queue.pl"
-    export decode_cmd="queue.pl"
-
-
-# "qsub" (Torque/PBS.)
-elif [ "${cmd_backend}" = pbs ]; then
-    # The default setting is written in conf/pbs.conf.
-
-    export train_cmd="pbs.pl"
-    export cuda_cmd="pbs.pl"
-    export decode_cmd="pbs.pl"
-
-
-# "sbatch" (Slurm)
-elif [ "${cmd_backend}" = slurm ]; then
-    # The default setting is written in conf/slurm.conf.
-    # You must change "-p cpu" and "-p gpu" for the "partition" for your environment.
-    # To know the "partion" names, type "sinfo".
-    # You can use "--gpu * " by default for slurm and it is interpreted as "--gres gpu:*"
-    # The devices are allocated exclusively using "${CUDA_VISIBLE_DEVICES}".
-
-    export train_cmd="slurm.pl"
-    export cuda_cmd="slurm.pl"
-    export decode_cmd="slurm.pl"
-
-elif [ "${cmd_backend}" = ssh ]; then
-    # You have to create ".queue/machines" to specify the host to execute jobs.
-    # e.g. .queue/machines
-    #   host1
-    #   host2
-    #   host3
-    # Assuming you can login them without any password, i.e. You have to set ssh keys.
-
-    export train_cmd="ssh.pl"
-    export cuda_cmd="ssh.pl"
-    export decode_cmd="ssh.pl"
-
-# This is an example of specifying several unique options in the JHU CLSP cluster setup.
-# Users can modify/add their own command options according to their cluster environments.
-elif [ "${cmd_backend}" = jhu ]; then
-
-    export train_cmd="queue.pl --mem 2G"
-    export cuda_cmd="queue-freegpu.pl --mem 2G --gpu 1 --config conf/queue.conf"
-    export decode_cmd="queue.pl --mem 4G"
-
-else
-    echo "$0: Error: Unknown cmd_backend=${cmd_backend}" 1>&2
-    return 1
-fi
diff --git a/egs2/how2/asr1/cmd.sh b/egs2/how2/asr1/cmd.sh
new file mode 120000
index 00000000000..f77e339f822
--- /dev/null
+++ b/egs2/how2/asr1/cmd.sh
@@ -0,0 +1 @@
+../../TEMPLATE/asr1/cmd.sh
\ No newline at end of file
diff --git a/egs2/how2_2000h/asr1/README.md b/egs2/how2_2000h/asr1/README.md
new file mode 100644
index 00000000000..b310f8cfb71
--- /dev/null
+++ b/egs2/how2_2000h/asr1/README.md
@@ -0,0 +1,30 @@
+## End to End Speech Recognition
+
+This recipe can be used to build E2E Speech Recognition models using restricted self-attention on the HowTo corpus of instructional videos. 
+
+HowTo 2000h fbank-pitch features have been released to enable reproduction of this recipe. 
+
+# Results on ASR
+
+
+## asr_base_conformer_lf_mix
+### WER
+
+|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err|
+|---|---|---|---|---|---|---|---|---|
+|decode_asr_model_valid.acc.best/dev5_test|3016|55215|93.1|4.8|2.1|1.9|8.8|56.7|
+|decode_asr_model_valid.acc.best/held_out_test|2761|47348|92.7|5.0|2.3|2.2|9.5|54.6|
+
+### CER
+
+|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err|
+|---|---|---|---|---|---|---|---|---|
+|decode_asr_model_valid.acc.best/dev5_test|3016|276377|97.1|1.1|1.9|1.9|4.8|56.7|
+|decode_asr_model_valid.acc.best/held_out_test|2761|236575|96.8|1.2|2.0|2.1|5.4|54.6|
+
+### TER
+
+|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err|
+|---|---|---|---|---|---|---|---|---|
+|decode_asr_model_valid.acc.best/dev5_test|3016|82484|94.1|3.5|2.4|2.2|8.0|56.7|
+|decode_asr_model_valid.acc.best/held_out_test|2761|70264|93.9|3.7|2.4|2.7|8.9|54.6|
diff --git a/egs2/how2_2000h/asr1/asr.sh b/egs2/how2_2000h/asr1/asr.sh
new file mode 120000
index 00000000000..60b05122cfd
--- /dev/null
+++ b/egs2/how2_2000h/asr1/asr.sh
@@ -0,0 +1 @@
+../../TEMPLATE/asr1/asr.sh
\ No newline at end of file
diff --git a/egs2/how2_2000h/asr1/cmd.sh b/egs2/how2_2000h/asr1/cmd.sh
new file mode 120000
index 00000000000..f77e339f822
--- /dev/null
+++ b/egs2/how2_2000h/asr1/cmd.sh
@@ -0,0 +1 @@
+../../TEMPLATE/asr1/cmd.sh
\ No newline at end of file
diff --git a/egs2/how2_2000h/asr1/conf/decode_asr.yaml b/egs2/how2_2000h/asr1/conf/decode_asr.yaml
new file mode 120000
index 00000000000..fa4714a6f43
--- /dev/null
+++ b/egs2/how2_2000h/asr1/conf/decode_asr.yaml
@@ -0,0 +1 @@
+tuning/decode_ctc.yaml
\ No newline at end of file
diff --git a/egs2/how2_2000h/asr1/conf/fbank.conf b/egs2/how2_2000h/asr1/conf/fbank.conf
new file mode 100644
index 00000000000..82ac7bd0dbc
--- /dev/null
+++ b/egs2/how2_2000h/asr1/conf/fbank.conf
@@ -0,0 +1,2 @@
+--sample-frequency=16000 
+--num-mel-bins=80
diff --git a/egs2/how2_2000h/asr1/conf/pbs.conf b/egs2/how2_2000h/asr1/conf/pbs.conf
new file mode 100644
index 00000000000..119509938ce
--- /dev/null
+++ b/egs2/how2_2000h/asr1/conf/pbs.conf
@@ -0,0 +1,11 @@
+# Default configuration
+command qsub -V -v PATH -S /bin/bash
+option name=* -N $0
+option mem=* -l mem=$0
+option mem=0          # Do not add anything to qsub_opts
+option num_threads=* -l ncpus=$0
+option num_threads=1  # Do not add anything to qsub_opts
+option num_nodes=* -l nodes=$0:ppn=1
+default gpu=0
+option gpu=0
+option gpu=* -l ngpus=$0
diff --git a/egs2/how2_2000h/asr1/conf/pitch.conf b/egs2/how2_2000h/asr1/conf/pitch.conf
new file mode 100644
index 00000000000..e959a19d5b8
--- /dev/null
+++ b/egs2/how2_2000h/asr1/conf/pitch.conf
@@ -0,0 +1 @@
+--sample-frequency=16000
diff --git a/egs2/how2_2000h/asr1/conf/queue.conf b/egs2/how2_2000h/asr1/conf/queue.conf
new file mode 100644
index 00000000000..500582fab31
--- /dev/null
+++ b/egs2/how2_2000h/asr1/conf/queue.conf
@@ -0,0 +1,12 @@
+# Default configuration
+command qsub -v PATH -cwd -S /bin/bash -j y -l arch=*64*
+option name=* -N $0
+option mem=* -l mem_free=$0,ram_free=$0
+option mem=0          # Do not add anything to qsub_opts
+option num_threads=* -pe smp $0
+option num_threads=1  # Do not add anything to qsub_opts
+option max_jobs_run=* -tc $0
+option num_nodes=* -pe mpi $0  # You must set this PE as allocation_rule=1
+default gpu=0
+option gpu=0
+option gpu=* -l gpu=$0 -q g.q
diff --git a/egs2/how2_2000h/asr1/conf/slurm.conf b/egs2/how2_2000h/asr1/conf/slurm.conf
new file mode 100644
index 00000000000..3b229673638
--- /dev/null
+++ b/egs2/how2_2000h/asr1/conf/slurm.conf
@@ -0,0 +1,14 @@
+# Default configuration
+command sbatch --export=PATH
+option name=* --job-name $0
+option time=* --time $0
+option mem=* --mem-per-cpu $0
+option mem=0
+option num_threads=* --cpus-per-task $0
+option num_threads=1 --cpus-per-task 1
+option num_nodes=* --nodes $0
+default gpu=0
+option gpu=0 -p cpu
+option gpu=* -p gpu --gres=gpu:$0 -c $0  # Recommend allocating more CPU than, or equal to the number of GPU
+# note: the --max-jobs-run option is supported as a special case
+# by slurm.pl and you don't have to handle it in the config file.
diff --git a/egs2/how2_2000h/asr1/conf/train_asr_conformer_lf.yaml b/egs2/how2_2000h/asr1/conf/train_asr_conformer_lf.yaml
new file mode 120000
index 00000000000..ee7d1d03dbc
--- /dev/null
+++ b/egs2/how2_2000h/asr1/conf/train_asr_conformer_lf.yaml
@@ -0,0 +1 @@
+tuning/train_asr_conformer_vid_ctc_lf.yaml
\ No newline at end of file
diff --git a/egs2/how2_2000h/asr1/conf/tuning/decode.yaml b/egs2/how2_2000h/asr1/conf/tuning/decode.yaml
new file mode 100644
index 00000000000..519477f7a43
--- /dev/null
+++ b/egs2/how2_2000h/asr1/conf/tuning/decode.yaml
@@ -0,0 +1,7 @@
+beam_size: 4
+batch_size: 1
+penalty: 0.0
+minlenratio: 0.0
+maxlenratio: 0.0
+ctc_weight: 0.3
+lm_weight: 0.0
diff --git a/egs2/how2_2000h/asr1/conf/tuning/decode_ctc.yaml b/egs2/how2_2000h/asr1/conf/tuning/decode_ctc.yaml
new file mode 100755
index 00000000000..03fdd93249f
--- /dev/null
+++ b/egs2/how2_2000h/asr1/conf/tuning/decode_ctc.yaml
@@ -0,0 +1,7 @@
+beam_size: 20
+batch_size: 1
+penalty: 0.1
+minlenratio: 0.0
+maxlenratio: 0.0
+ctc_weight: 1.0
+lm_weight: 0.0
diff --git a/egs2/how2_2000h/asr1/conf/tuning/pretrain_hubert.yaml b/egs2/how2_2000h/asr1/conf/tuning/pretrain_hubert.yaml
new file mode 100644
index 00000000000..f21213f3421
--- /dev/null
+++ b/egs2/how2_2000h/asr1/conf/tuning/pretrain_hubert.yaml
@@ -0,0 +1,58 @@
+grad_clip: 5.0
+batch_type: numel
+batch_bins: 150000000
+accum_grad: 1
+max_epoch: 400
+patience: none
+# Use self-defined function for initialization
+init: xavier_uniform 
+best_model_criterion:
+-   - valid
+    - acc
+    - max
+keep_nbest_models: 10
+
+input_size: 768
+encoder: avhubert_pretrain
+encoder_conf:
+    output_size: 768
+    linear_units: 3072
+    attention_heads: 8
+    num_blocks: 12
+    dropout_rate: 0.1
+    attention_dropout_rate: 0.0
+    dropout_input: 0.1
+    dropout_features: 0.1
+    skip_masked: false
+    skip_nomask: false
+    mask_prob: 0.80
+    extractor_mode: default
+    conv_feature_layers: '[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2'
+    final_dim: 256
+    encoder_layerdrop: 0.05
+    feature_grad_mult: 0.1
+    untie_final_proj: true
+    label_rate: 100
+    sample_rate: 16000
+
+model_conf:
+    lsm_weight: 0.1
+    length_normalized_loss: false
+    pred_masked_weight: 1.0
+    pred_nomask_weight: 0.0
+    loss_weights: 10.0
+
+optim: adam
+optim_conf:
+    lr: 0.0005
+scheduler: warmuplr
+scheduler_conf:
+    warmup_steps: 25000
+
+unused_parameters: true
+
+frontend: null
+
+normalize: null
+
+specaug: null
\ No newline at end of file
diff --git a/egs2/how2_2000h/asr1/conf/tuning/train_asr_conformer_vid_ctc_lf.yaml b/egs2/how2_2000h/asr1/conf/tuning/train_asr_conformer_vid_ctc_lf.yaml
new file mode 100644
index 00000000000..606081aa9b1
--- /dev/null
+++ b/egs2/how2_2000h/asr1/conf/tuning/train_asr_conformer_vid_ctc_lf.yaml
@@ -0,0 +1,78 @@
+# This configuration requires Tesla V100-SXM2 (32GB) x 8 GPUs. It takes about 4 days.
+batch_type: length
+batch_bins: 60000000
+accum_grad: 10
+max_epoch: 100
+patience: none
+init: xavier_uniform 
+best_model_criterion:
+-   - valid
+    - acc
+    - max
+keep_nbest_models: 10
+log_interval: 5000
+encoder: longformer
+encoder_conf:
+    output_size: 512
+    attention_heads: 8
+    linear_units: 2048
+    num_blocks: 12
+    dropout_rate: 0.1
+    positional_dropout_rate: 0.1
+    attention_dropout_rate: 0.1
+    input_layer: conv2d
+    normalize_before: true
+    macaron_style: true
+    pos_enc_layer_type: "abs_pos"
+    selfattention_layer_type: "lf_selfattn"
+    activation_type: "swish"
+    use_cnn_module:  true
+    cnn_module_kernel: 31
+    attention_windows: [20,20,20,20,20,20,40,40,40,40,40,40]
+    attention_dilation: [1,1,1,1,1,1,1,1,1,1,1,1]
+    attention_mode: tvm
+
+decoder: transformer
+decoder_conf:
+    attention_heads: 4
+    linear_units: 512
+    num_blocks: 6
+    dropout_rate: 0.1
+    positional_dropout_rate: 0.1
+    self_attention_dropout_rate: 0.1
+    src_attention_dropout_rate: 0.1
+
+model_conf:
+    ctc_weight: 0.3
+    lsm_weight: 0.1
+    length_normalized_loss: false
+
+optim: adam
+optim_conf:
+    lr: 0.002
+scheduler: warmuplr
+scheduler_conf: 
+        warmup_steps: 25000
+        
+ctc_conf:
+        ignore_nan_grad: true
+frontend_conf:
+  n_fft: 512
+  hop_length: 256
+
+unused_parameters: True 
+specaug: specaug
+specaug_conf:
+    apply_time_warp: true
+    time_warp_window: 5
+    time_warp_mode: bicubic
+    apply_freq_mask: true
+    freq_mask_width_range:
+    - 0
+    - 30
+    num_freq_mask: 2
+    apply_time_mask: true
+    time_mask_width_range:
+    - 0
+    - 40
+    num_time_mask: 2
diff --git a/egs2/how2_2000h/asr1/db.sh b/egs2/how2_2000h/asr1/db.sh
new file mode 120000
index 00000000000..50d86130898
--- /dev/null
+++ b/egs2/how2_2000h/asr1/db.sh
@@ -0,0 +1 @@
+../../TEMPLATE/asr1/db.sh
\ No newline at end of file
diff --git a/egs2/how2_2000h/asr1/local/data.sh b/egs2/how2_2000h/asr1/local/data.sh
new file mode 100755
index 00000000000..ffd918d7d09
--- /dev/null
+++ b/egs2/how2_2000h/asr1/local/data.sh
@@ -0,0 +1,53 @@
+#!/usr/bin/env bash
+
+set -e
+set -u
+set -o pipefail
+
+log() {
+    local fname=${BASH_SOURCE[1]##*/}
+    echo -e "$(date '+%Y-%m-%dT%H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
+}
+SECONDS=0
+
+stage=0
+stop_stage=1
+
+. ./db.sh
+. ./path.sh
+. ./cmd.sh
+
+url_how2_2000="https://drive.google.com/file/d/1SHg7La_hflMTIm6gaCus46sn4zYqWJvb/view?usp=sharing"
+data_how2=how2_feats
+
+log "$0 $*"
+. utils/parse_options.sh
+
+if [ $# -ne 0 ]; then
+    log "Error: No positional arguments are required."
+    exit 2
+fi
+
+. ./path.sh
+. ./cmd.sh
+
+
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+    log "stage 0: Data download"
+
+    if [ -d ${data_how2} ]; then
+        log "$0: HowTo directory or archive already exists in ${data_how2}. Skipping download."
+    else
+        ../../../utils/download_from_google_drive.sh ${url_how2_2000} $PWD tar.gz
+        log "$0: Successfully downloaded and un-tarred how2_feats.tar.gz"
+    fi
+fi
+
+
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+    log "stage 1: Data preparation and verification"
+    mv how2_feats/data .
+    mv how2_feats/fbank .
+fi 
+
+log "Successfully finished. [elapsed=${SECONDS}s]"
diff --git a/egs2/how2_2000h/asr1/local/data_normalization/nlsyms b/egs2/how2_2000h/asr1/local/data_normalization/nlsyms
new file mode 100644
index 00000000000..8497d1e0046
--- /dev/null
+++ b/egs2/how2_2000h/asr1/local/data_normalization/nlsyms
@@ -0,0 +1,2 @@
+(h|H)(m|M)+ hesmark
+(U|u)(m|M)+ hesmark
\ No newline at end of file
diff --git a/egs2/how2_2000h/asr1/local/data_normalization/shortened b/egs2/how2_2000h/asr1/local/data_normalization/shortened
new file mode 100644
index 00000000000..305228debf3
--- /dev/null
+++ b/egs2/how2_2000h/asr1/local/data_normalization/shortened
@@ -0,0 +1,4 @@
+Dr. Doctor
+Mr. Mister
+Ms. Miss
+No.1 Number one
\ No newline at end of file
diff --git a/egs2/how2_2000h/asr1/local/data_normalization/symbols b/egs2/how2_2000h/asr1/local/data_normalization/symbols
new file mode 100644
index 00000000000..915b7a23070
--- /dev/null
+++ b/egs2/how2_2000h/asr1/local/data_normalization/symbols
@@ -0,0 +1,6 @@
+¾ 3/4
+½ 1/2
+% percent
+[+] plus
+= equal
+&[^;]*; and
\ No newline at end of file
diff --git a/egs2/how2_2000h/asr1/local/data_normalization/url b/egs2/how2_2000h/asr1/local/data_normalization/url
new file mode 100644
index 00000000000..ca215839a83
--- /dev/null
+++ b/egs2/how2_2000h/asr1/local/data_normalization/url
@@ -0,0 +1,5 @@
+@ at
+www[.] www dot
+[.]com[/] dot com slash
+[.]com dot com
+[.]org dot org
\ No newline at end of file
diff --git a/egs2/how2_2000h/asr1/local/path.sh b/egs2/how2_2000h/asr1/local/path.sh
new file mode 100755
index 00000000000..a0b8041dfb2
--- /dev/null
+++ b/egs2/how2_2000h/asr1/local/path.sh
@@ -0,0 +1,19 @@
+# check extra module installation
+if ! python -c 'import longformer; import nlgeval; import datasets' > /dev/null; then
+    echo "Error: it seems that longformer is not installed." >&2
+    echo "Error: please install longformer as follows." >&2
+    echo "Error: cd ${MAIN_ROOT}/tools && make longformer.done" >&2
+    return 1
+fi
+if ! python -c 'import nlgeval' > /dev/null; then
+    echo "Error: it seems that nlgeval is not installed." >&2
+    echo "Error: please install nlgeval as follows." >&2
+    echo "Error: cd ${MAIN_ROOT}/tools && make longformer.done" >&2
+    return 1
+fi
+if ! python -c 'import datasets' > /dev/null; then
+    echo "Error: it seems that datasets is not installed." >&2
+    echo "Error: please install datasets as follows." >&2
+    echo "Error: cd ${MAIN_ROOT}/tools && make longformer.done" >&2
+    return 1
+fi
diff --git a/egs2/how2_2000h/asr1/path.sh b/egs2/how2_2000h/asr1/path.sh
new file mode 120000
index 00000000000..c9ac0a75bc6
--- /dev/null
+++ b/egs2/how2_2000h/asr1/path.sh
@@ -0,0 +1 @@
+../../TEMPLATE/asr1/path.sh
\ No newline at end of file
diff --git a/egs2/how2_2000h/asr1/pyscripts b/egs2/how2_2000h/asr1/pyscripts
new file mode 120000
index 00000000000..ac68ad75b60
--- /dev/null
+++ b/egs2/how2_2000h/asr1/pyscripts
@@ -0,0 +1 @@
+../../TEMPLATE/asr1/pyscripts
\ No newline at end of file
diff --git a/egs2/how2_2000h/asr1/run.sh b/egs2/how2_2000h/asr1/run.sh
new file mode 100755
index 00000000000..7ff75326ed3
--- /dev/null
+++ b/egs2/how2_2000h/asr1/run.sh
@@ -0,0 +1,39 @@
+#!/usr/bin/env bash
+# HowTo 2000h ASR recipe: forwards the settings below, plus any extra arguments, to ./asr.sh.
+set -e
+set -u
+set -o pipefail
+
+train_set="tr_2000h_utt"
+valid_set="cv05_utt"
+test_sets="dev5_test_utt"
+
+asr_config=conf/train_asr_conformer_lf.yaml
+inference_config=conf/decode_asr.yaml
+
+feats_type=extracted
+
+token_type=bpe
+
+nlsyms=data/nlsyms
+nbpe=1000
+bpe_nlsyms="[hes]"
+
+use_lm=false
+
+# Every expansion below is quoted. In particular bpe_nlsyms is "[hes]", which
+# unquoted is a glob pattern and would expand to a file named h, e or s if one exists.
+./asr.sh \
+    --lang en \
+    --feats_type "${feats_type}" \
+    --token_type "${token_type}" \
+    --nbpe "${nbpe}" \
+    --nlsyms_txt "${nlsyms}" \
+    --bpe_nlsyms "${bpe_nlsyms}" \
+    --use_lm "${use_lm}" \
+    --asr_config "${asr_config}" \
+    --inference_config "${inference_config}" \
+    --train_set "${train_set}" \
+    --valid_set "${valid_set}" \
+    --test_sets "${test_sets}" \
+    --bpe_train_text "data/${train_set}/text" "$@"
diff --git a/egs2/how2_2000h/asr1/scripts b/egs2/how2_2000h/asr1/scripts
new file mode 120000
index 00000000000..b25829705dc
--- /dev/null
+++ b/egs2/how2_2000h/asr1/scripts
@@ -0,0 +1 @@
+../../TEMPLATE/asr1/scripts
\ No newline at end of file
diff --git a/egs2/how2_2000h/asr1/steps b/egs2/how2_2000h/asr1/steps
new file mode 120000
index 00000000000..91f2d234e20
--- /dev/null
+++ b/egs2/how2_2000h/asr1/steps
@@ -0,0 +1 @@
+../../../tools/kaldi/egs/wsj/s5/steps
\ No newline at end of file
diff --git a/egs2/how2_2000h/asr1/utils b/egs2/how2_2000h/asr1/utils
new file mode 120000
index 00000000000..f49247da827
--- /dev/null
+++ b/egs2/how2_2000h/asr1/utils
@@ -0,0 +1 @@
+../../../tools/kaldi/egs/wsj/s5/utils
\ No newline at end of file
diff --git a/egs2/how2_2000h/sum1/README.md b/egs2/how2_2000h/sum1/README.md
new file mode 100644
index 00000000000..4b4c1fcc165
--- /dev/null
+++ b/egs2/how2_2000h/sum1/README.md
@@ -0,0 +1,73 @@
+## End to End Speech Summarization
+
+This recipe can be used to build E2E Speech Summarization models using restricted self-attention on the HowTo corpus of instructional videos. 
+
+HowTo 2000h fbank-pitch features have been released to enable reproduction of this recipe. 
+
+Training is done in two stages: (a) ASR pretraining, and (b) summarization fine-tuning.
+
+First, run ASR pretraining as follows.
+This step is based on the asr1 recipe:
+```bash
+local/run_asr.sh --asr_tag asr_pretrain
+``` 
+Then run fine-tuning on summarization, using the previously trained ASR model as the initialization:
+
+```bash
+./run.sh --asr_tag sum_finetune --asr_args "--init_param exp/asr_asr_pretrain/valid.acc.ave_10best.pth:::ctc"
+```
+
+# Results on ASR
+
+
+## asr_base_conformer_lf_mix
+
+### WER
+
+|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err|
+|---|---|---|---|---|---|---|---|---|
+|decode_asr_model_valid.acc.best/dev5_test|3016|55215|93.1|4.8|2.1|1.9|8.8|56.7|
+|decode_asr_model_valid.acc.best/held_out_test|2761|47348|92.7|5.0|2.3|2.2|9.5|54.6|
+
+### CER
+
+|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err|
+|---|---|---|---|---|---|---|---|---|
+|decode_asr_model_valid.acc.best/dev5_test|3016|276377|97.1|1.1|1.9|1.9|4.8|56.7|
+|decode_asr_model_valid.acc.best/held_out_test|2761|236575|96.8|1.2|2.0|2.1|5.4|54.6|
+
+### TER
+
+|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err|
+|---|---|---|---|---|---|---|---|---|
+|decode_asr_model_valid.acc.best/dev5_test|3016|82484|94.1|3.5|2.4|2.2|8.0|56.7|
+|decode_asr_model_valid.acc.best/held_out_test|2761|70264|93.9|3.7|2.4|2.7|8.9|54.6|
+
+
+
+# Results on Summarization
+
+## asr_ft_sum
+### SUMM
+- Model link: [huggingface](https://huggingface.co/espnet/roshansh_how2_asr_raw_ft_sum_valid.acc)
+- ASR config: [./conf/train_sum_conformer_lf.yaml](./conf/train_sum_conformer_lf.yaml)
+- Inference config: [./conf/decode_sum.yaml](./conf/decode_sum.yaml)
+
+|dataset|Snt|Wrd|ROUGE-1|ROUGE-2|ROUGE-L|METEOR|BERTScore|
+|---|---|---|---|---|---|---|---|
+|decode_sum_asr_model_valid.acc.best/dev5_test_sum|2127|69795|60.72|44.7|56.1|29.36|91.53|
+
+
+
+Please cite the following paper if you use this recipe:
+```Bibtex
+@misc{sharma2022speech,
+      title={Speech Summarization using Restricted Self-Attention}, 
+      author={Roshan Sharma and Shruti Palaskar and Alan W Black and Florian Metze},
+      year={2022},
+      eprint={2110.06263},
+      archivePrefix={arXiv},
+      primaryClass={cs.CL}
+}
+
+```
diff --git a/egs2/how2_2000h/sum1/asr.sh b/egs2/how2_2000h/sum1/asr.sh
new file mode 120000
index 00000000000..60b05122cfd
--- /dev/null
+++ b/egs2/how2_2000h/sum1/asr.sh
@@ -0,0 +1 @@
+../../TEMPLATE/asr1/asr.sh
\ No newline at end of file
diff --git a/egs2/how2_2000h/sum1/cmd.sh b/egs2/how2_2000h/sum1/cmd.sh
new file mode 120000
index 00000000000..f77e339f822
--- /dev/null
+++ b/egs2/how2_2000h/sum1/cmd.sh
@@ -0,0 +1 @@
+../../TEMPLATE/asr1/cmd.sh
\ No newline at end of file
diff --git a/egs2/how2_2000h/sum1/conf/decode_asr.yaml b/egs2/how2_2000h/sum1/conf/decode_asr.yaml
new file mode 120000
index 00000000000..fa4714a6f43
--- /dev/null
+++ b/egs2/how2_2000h/sum1/conf/decode_asr.yaml
@@ -0,0 +1 @@
+tuning/decode_ctc.yaml
\ No newline at end of file
diff --git a/egs2/how2_2000h/sum1/conf/decode_sum.yaml b/egs2/how2_2000h/sum1/conf/decode_sum.yaml
new file mode 120000
index 00000000000..27c573f341b
--- /dev/null
+++ b/egs2/how2_2000h/sum1/conf/decode_sum.yaml
@@ -0,0 +1 @@
+tuning/decode_sum.yaml
\ No newline at end of file
diff --git a/egs2/how2_2000h/sum1/conf/fbank.conf b/egs2/how2_2000h/sum1/conf/fbank.conf
new file mode 100644
index 00000000000..82ac7bd0dbc
--- /dev/null
+++ b/egs2/how2_2000h/sum1/conf/fbank.conf
@@ -0,0 +1,2 @@
+--sample-frequency=16000 
+--num-mel-bins=80
diff --git a/egs2/how2_2000h/sum1/conf/pbs.conf b/egs2/how2_2000h/sum1/conf/pbs.conf
new file mode 100644
index 00000000000..119509938ce
--- /dev/null
+++ b/egs2/how2_2000h/sum1/conf/pbs.conf
@@ -0,0 +1,11 @@
+# Default configuration
+command qsub -V -v PATH -S /bin/bash
+option name=* -N $0
+option mem=* -l mem=$0
+option mem=0          # Do not add anything to qsub_opts
+option num_threads=* -l ncpus=$0
+option num_threads=1  # Do not add anything to qsub_opts
+option num_nodes=* -l nodes=$0:ppn=1
+default gpu=0
+option gpu=0
+option gpu=* -l ngpus=$0
diff --git a/egs2/how2_2000h/sum1/conf/pitch.conf b/egs2/how2_2000h/sum1/conf/pitch.conf
new file mode 100644
index 00000000000..e959a19d5b8
--- /dev/null
+++ b/egs2/how2_2000h/sum1/conf/pitch.conf
@@ -0,0 +1 @@
+--sample-frequency=16000
diff --git a/egs2/how2_2000h/sum1/conf/queue.conf b/egs2/how2_2000h/sum1/conf/queue.conf
new file mode 100644
index 00000000000..500582fab31
--- /dev/null
+++ b/egs2/how2_2000h/sum1/conf/queue.conf
@@ -0,0 +1,12 @@
+# Default configuration
+command qsub -v PATH -cwd -S /bin/bash -j y -l arch=*64*
+option name=* -N $0
+option mem=* -l mem_free=$0,ram_free=$0
+option mem=0          # Do not add anything to qsub_opts
+option num_threads=* -pe smp $0
+option num_threads=1  # Do not add anything to qsub_opts
+option max_jobs_run=* -tc $0
+option num_nodes=* -pe mpi $0  # You must set this PE as allocation_rule=1
+default gpu=0
+option gpu=0
+option gpu=* -l gpu=$0 -q g.q
diff --git a/egs2/how2_2000h/sum1/conf/slurm.conf b/egs2/how2_2000h/sum1/conf/slurm.conf
new file mode 100644
index 00000000000..3b229673638
--- /dev/null
+++ b/egs2/how2_2000h/sum1/conf/slurm.conf
@@ -0,0 +1,14 @@
+# Default configuration
+command sbatch --export=PATH
+option name=* --job-name $0
+option time=* --time $0
+option mem=* --mem-per-cpu $0
+option mem=0
+option num_threads=* --cpus-per-task $0
+option num_threads=1 --cpus-per-task 1
+option num_nodes=* --nodes $0
+default gpu=0
+option gpu=0 -p cpu
+option gpu=* -p gpu --gres=gpu:$0 -c $0  # Recommend allocating more CPU than, or equal to the number of GPU
+# note: the --max-jobs-run option is supported as a special case
+# by slurm.pl and you don't have to handle it in the config file.
diff --git a/egs2/how2_2000h/sum1/conf/train_asr_conformer_lf.yaml b/egs2/how2_2000h/sum1/conf/train_asr_conformer_lf.yaml
new file mode 120000
index 00000000000..ee7d1d03dbc
--- /dev/null
+++ b/egs2/how2_2000h/sum1/conf/train_asr_conformer_lf.yaml
@@ -0,0 +1 @@
+tuning/train_asr_conformer_vid_ctc_lf.yaml
\ No newline at end of file
diff --git a/egs2/how2_2000h/sum1/conf/train_sum_conformer_lf.yaml b/egs2/how2_2000h/sum1/conf/train_sum_conformer_lf.yaml
new file mode 120000
index 00000000000..ba6ab56ca56
--- /dev/null
+++ b/egs2/how2_2000h/sum1/conf/train_sum_conformer_lf.yaml
@@ -0,0 +1 @@
+tuning/train_asr_conformer_vid_lf.yaml
\ No newline at end of file
diff --git a/egs2/how2_2000h/sum1/conf/tuning/decode.yaml b/egs2/how2_2000h/sum1/conf/tuning/decode.yaml
new file mode 100644
index 00000000000..519477f7a43
--- /dev/null
+++ b/egs2/how2_2000h/sum1/conf/tuning/decode.yaml
@@ -0,0 +1,7 @@
+beam_size: 4
+batch_size: 1
+penalty: 0.0
+minlenratio: 0.0
+maxlenratio: 0.0
+ctc_weight: 0.3
+lm_weight: 0.0
diff --git a/egs2/how2_2000h/sum1/conf/tuning/decode_ctc.yaml b/egs2/how2_2000h/sum1/conf/tuning/decode_ctc.yaml
new file mode 100755
index 00000000000..03fdd93249f
--- /dev/null
+++ b/egs2/how2_2000h/sum1/conf/tuning/decode_ctc.yaml
@@ -0,0 +1,7 @@
+beam_size: 20
+batch_size: 1
+penalty: 0.1
+minlenratio: 0.0
+maxlenratio: 0.0
+ctc_weight: 1.0
+lm_weight: 0.0
diff --git a/egs2/how2_2000h/sum1/conf/tuning/decode_sum.yaml b/egs2/how2_2000h/sum1/conf/tuning/decode_sum.yaml
new file mode 100755
index 00000000000..4682af74153
--- /dev/null
+++ b/egs2/how2_2000h/sum1/conf/tuning/decode_sum.yaml
@@ -0,0 +1,7 @@
+beam_size: 4
+batch_size: 1
+penalty: 0.1
+minlenratio: 0.01
+maxlenratio: 0.2
+ctc_weight: 0.0
+lm_weight: 0.0
diff --git a/egs2/how2_2000h/sum1/conf/tuning/pretrain_hubert.yaml b/egs2/how2_2000h/sum1/conf/tuning/pretrain_hubert.yaml
new file mode 100644
index 00000000000..f21213f3421
--- /dev/null
+++ b/egs2/how2_2000h/sum1/conf/tuning/pretrain_hubert.yaml
@@ -0,0 +1,58 @@
+grad_clip: 5.0
+batch_type: numel
+batch_bins: 150000000
+accum_grad: 1
+max_epoch: 400
+patience: none
+# Use self-defined function for initialization
+init: xavier_uniform 
+best_model_criterion:
+-   - valid
+    - acc
+    - max
+keep_nbest_models: 10
+
+input_size: 768
+encoder: avhubert_pretrain
+encoder_conf:
+    output_size: 768
+    linear_units: 3072
+    attention_heads: 8
+    num_blocks: 12
+    dropout_rate: 0.1
+    attention_dropout_rate: 0.0
+    dropout_input: 0.1
+    dropout_features: 0.1
+    skip_masked: false
+    skip_nomask: false
+    mask_prob: 0.80
+    extractor_mode: default
+    conv_feature_layers: '[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2'
+    final_dim: 256
+    encoder_layerdrop: 0.05
+    feature_grad_mult: 0.1
+    untie_final_proj: true
+    label_rate: 100
+    sample_rate: 16000
+
+model_conf:
+    lsm_weight: 0.1
+    length_normalized_loss: false
+    pred_masked_weight: 1.0
+    pred_nomask_weight: 0.0
+    loss_weights: 10.0
+
+optim: adam
+optim_conf:
+    lr: 0.0005
+scheduler: warmuplr
+scheduler_conf:
+    warmup_steps: 25000
+
+unused_parameters: true
+
+frontend: null
+
+normalize: null
+
+specaug: null
\ No newline at end of file
diff --git a/egs2/how2_2000h/sum1/conf/tuning/train_asr_conformer_vid_ctc_lf.yaml b/egs2/how2_2000h/sum1/conf/tuning/train_asr_conformer_vid_ctc_lf.yaml
new file mode 100644
index 00000000000..606081aa9b1
--- /dev/null
+++ b/egs2/how2_2000h/sum1/conf/tuning/train_asr_conformer_vid_ctc_lf.yaml
@@ -0,0 +1,78 @@
+# This configuration requires Tesla V100-SXM2 (32GB) x 8 GPUs. It takes about 4 days.
+batch_type: length
+batch_bins: 60000000
+accum_grad: 10
+max_epoch: 100
+patience: none
+init: xavier_uniform 
+best_model_criterion:
+-   - valid
+    - acc
+    - max
+keep_nbest_models: 10
+log_interval: 5000
+encoder: longformer
+encoder_conf:
+    output_size: 512
+    attention_heads: 8
+    linear_units: 2048
+    num_blocks: 12
+    dropout_rate: 0.1
+    positional_dropout_rate: 0.1
+    attention_dropout_rate: 0.1
+    input_layer: conv2d
+    normalize_before: true
+    macaron_style: true
+    pos_enc_layer_type: "abs_pos"
+    selfattention_layer_type: "lf_selfattn"
+    activation_type: "swish"
+    use_cnn_module:  true
+    cnn_module_kernel: 31
+    attention_windows: [20,20,20,20,20,20,40,40,40,40,40,40]
+    attention_dilation: [1,1,1,1,1,1,1,1,1,1,1,1]
+    attention_mode: tvm
+
+decoder: transformer
+decoder_conf:
+    attention_heads: 4
+    linear_units: 512
+    num_blocks: 6
+    dropout_rate: 0.1
+    positional_dropout_rate: 0.1
+    self_attention_dropout_rate: 0.1
+    src_attention_dropout_rate: 0.1
+
+model_conf:
+    ctc_weight: 0.3
+    lsm_weight: 0.1
+    length_normalized_loss: false
+
+optim: adam
+optim_conf:
+    lr: 0.002
+scheduler: warmuplr
+scheduler_conf: 
+        warmup_steps: 25000
+        
+ctc_conf:
+        ignore_nan_grad: true
+frontend_conf:
+  n_fft: 512
+  hop_length: 256
+
+unused_parameters: True 
+specaug: specaug
+specaug_conf:
+    apply_time_warp: true
+    time_warp_window: 5
+    time_warp_mode: bicubic
+    apply_freq_mask: true
+    freq_mask_width_range:
+    - 0
+    - 30
+    num_freq_mask: 2
+    apply_time_mask: true
+    time_mask_width_range:
+    - 0
+    - 40
+    num_time_mask: 2
diff --git a/egs2/how2_2000h/sum1/conf/tuning/train_asr_conformer_vid_lf.yaml b/egs2/how2_2000h/sum1/conf/tuning/train_asr_conformer_vid_lf.yaml
new file mode 100644
index 00000000000..f0454207ee5
--- /dev/null
+++ b/egs2/how2_2000h/sum1/conf/tuning/train_asr_conformer_vid_lf.yaml
@@ -0,0 +1,80 @@
+# This configuration requires Tesla V100-SXM2 (32GB) x 8 GPUs. It takes about 2 days.
+batch_bins: 200000
+batch_type: length
+accum_grad: 10
+max_epoch: 100
+patience: 10
+init: xavier_uniform
+best_model_criterion:
+-   - valid
+    - acc
+    - max
+keep_nbest_models: 10
+log_interval: 5000
+encoder: longformer
+encoder_conf:
+    output_size: 512
+    attention_heads: 8
+    linear_units: 2048
+    num_blocks: 12
+    dropout_rate: 0.1
+    positional_dropout_rate: 0.1
+    attention_dropout_rate: 0.1
+    input_layer: conv2d2
+    normalize_before: true
+    macaron_style: true
+    pos_enc_layer_type: "abs_pos"
+    selfattention_layer_type: "lf_selfattn"
+    activation_type: "swish"
+    use_cnn_module:  true
+    cnn_module_kernel: 31
+    attention_windows: [40,40,40,40,40,40,40,40,60,60,60,60]
+    attention_dilation: [1,1,1,1,1,1,1,1,1,1,1,1]
+    attention_mode: tvm
+
+decoder: transformer
+decoder_conf:
+    attention_heads: 4
+    linear_units: 512
+    num_blocks: 6
+    dropout_rate: 0.15
+    positional_dropout_rate: 0.15
+    self_attention_dropout_rate: 0.15
+    src_attention_dropout_rate: 0.15
+
+model_conf:
+    ctc_weight: 0.0
+    lsm_weight: 0.15
+    length_normalized_loss: false
+
+optim: adam
+optim_conf:
+    lr: 0.0001
+scheduler: reducelronplateau
+scheduler_conf:
+        mode: min
+        factor: 0.5
+        patience: 1
+    #scheduler: warmuplr
+    #scheduler_conf: 
+    #    warmup_steps: 25000
+        
+ctc_conf:
+        ignore_nan_grad: true
+
+unused_parameters: True 
+specaug: specaug
+specaug_conf:
+    apply_time_warp: true
+    time_warp_window: 5
+    time_warp_mode: bicubic
+    apply_freq_mask: true
+    freq_mask_width_range:
+    - 0
+    - 30
+    num_freq_mask: 2
+    apply_time_mask: true
+    time_mask_width_range:
+    - 0
+    - 40
+    num_time_mask: 2
diff --git a/egs2/how2_2000h/sum1/db.sh b/egs2/how2_2000h/sum1/db.sh
new file mode 120000
index 00000000000..50d86130898
--- /dev/null
+++ b/egs2/how2_2000h/sum1/db.sh
@@ -0,0 +1 @@
+../../TEMPLATE/asr1/db.sh
\ No newline at end of file
diff --git a/egs2/how2_2000h/sum1/local/data.sh b/egs2/how2_2000h/sum1/local/data.sh
new file mode 100755
index 00000000000..ffd918d7d09
--- /dev/null
+++ b/egs2/how2_2000h/sum1/local/data.sh
@@ -0,0 +1,53 @@
+#!/usr/bin/env bash
+# Download the How2 2000h feature archive and move its data/ and fbank/ directories into the recipe directory.
+set -euo pipefail
+
+# Print a timestamped message tagged with the calling file, line number and function.
+log() {
+    local fname=${BASH_SOURCE[1]##*/}
+    echo -e "$(date '+%Y-%m-%dT%H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
+}
+SECONDS=0
+stage=0
+stop_stage=1
+
+. ./db.sh
+. ./path.sh
+. ./cmd.sh
+
+url_how2_2000="https://drive.google.com/file/d/1SHg7La_hflMTIm6gaCus46sn4zYqWJvb/view?usp=sharing"
+data_how2=how2_feats
+
+log "$0 $*"
+. utils/parse_options.sh
+
+if [ $# -ne 0 ]; then
+    log "Error: No positional arguments are required."
+    exit 2
+fi
+
+if [ "${stage}" -le 0 ] && [ "${stop_stage}" -ge 0 ]; then
+    log "stage 0: Data download"
+    if [ -d "${data_how2}" ]; then
+        log "$0: HowTo directory or archive already exists in ${data_how2}. Skipping download."
+    else
+        # The URL contains '?', so it is quoted to keep the shell from treating it as a glob.
+        ../../../utils/download_from_google_drive.sh "${url_how2_2000}" "${PWD}" tar.gz
+        log "$0: Successfully downloaded and un-tarred how2_feats.tar.gz"
+    fi
+fi
+
+if [ "${stage}" -le 1 ] && [ "${stop_stage}" -ge 1 ]; then
+    log "stage 1: Data preparation and verification"
+    for subdir in data fbank; do
+        # Sub-directories that were already moved are skipped so this stage can be re-run.
+        if [ -d "${data_how2}/${subdir}" ]; then
+            mv "${data_how2}/${subdir}" .
+        elif [ ! -d "${subdir}" ]; then
+            log "Error: neither ${data_how2}/${subdir} nor ./${subdir} exists."
+            exit 1
+        fi
+    done
+fi
+
+log "Successfully finished. [elapsed=${SECONDS}s]"
diff --git a/egs2/how2_2000h/sum1/local/data_normalization/nlsyms b/egs2/how2_2000h/sum1/local/data_normalization/nlsyms
new file mode 100644
index 00000000000..8497d1e0046
--- /dev/null
+++ b/egs2/how2_2000h/sum1/local/data_normalization/nlsyms
@@ -0,0 +1,2 @@
+(h|H)(m|M)+ hesmark
+(U|u)(m|M)+ hesmark
\ No newline at end of file
diff --git a/egs2/how2_2000h/sum1/local/data_normalization/shortened b/egs2/how2_2000h/sum1/local/data_normalization/shortened
new file mode 100644
index 00000000000..305228debf3
--- /dev/null
+++ b/egs2/how2_2000h/sum1/local/data_normalization/shortened
@@ -0,0 +1,4 @@
+Dr. Doctor
+Mr. Mister
+Ms. Miss
+No.1 Number one
\ No newline at end of file
diff --git a/egs2/how2_2000h/sum1/local/data_normalization/symbols b/egs2/how2_2000h/sum1/local/data_normalization/symbols
new file mode 100644
index 00000000000..915b7a23070
--- /dev/null
+++ b/egs2/how2_2000h/sum1/local/data_normalization/symbols
@@ -0,0 +1,6 @@
+¾ 3/4
+½ 1/2
+% percent
+[+] plus
+= equal
+&[^;]*; and
\ No newline at end of file
diff --git a/egs2/how2_2000h/sum1/local/data_normalization/url b/egs2/how2_2000h/sum1/local/data_normalization/url
new file mode 100644
index 00000000000..ca215839a83
--- /dev/null
+++ b/egs2/how2_2000h/sum1/local/data_normalization/url
@@ -0,0 +1,5 @@
+@ at
+www[.] www dot
+[.]com[/] dot com slash
+[.]com dot com
+[.]org dot org
\ No newline at end of file
diff --git a/egs2/how2_2000h/sum1/local/path.sh b/egs2/how2_2000h/sum1/local/path.sh
new file mode 100755
index 00000000000..a0b8041dfb2
--- /dev/null
+++ b/egs2/how2_2000h/sum1/local/path.sh
@@ -0,0 +1,19 @@
+# Check the extra Python modules this recipe needs, one module per check so each error names the right one.
+if ! python -c 'import longformer' > /dev/null; then
+    echo "Error: it seems that longformer is not installed." >&2
+    echo "Error: please install longformer as follows." >&2
+    echo "Error: cd ${MAIN_ROOT}/tools && make longformer.done" >&2
+    return 1
+fi
+if ! python -c 'import nlgeval' > /dev/null; then
+    echo "Error: it seems that nlgeval is not installed." >&2
+    echo "Error: please install nlgeval as follows." >&2
+    echo "Error: cd ${MAIN_ROOT}/tools && make longformer.done" >&2
+    return 1
+fi
+if ! python -c 'import datasets' > /dev/null; then
+    echo "Error: it seems that datasets is not installed." >&2
+    echo "Error: please install datasets as follows." >&2
+    echo "Error: cd ${MAIN_ROOT}/tools && make longformer.done" >&2
+    return 1
+fi
diff --git a/egs2/how2_2000h/sum1/local/run_asr.sh b/egs2/how2_2000h/sum1/local/run_asr.sh
new file mode 120000
index 00000000000..8d5b78f2cf0
--- /dev/null
+++ b/egs2/how2_2000h/sum1/local/run_asr.sh
@@ -0,0 +1 @@
+../../asr1/run.sh
\ No newline at end of file
diff --git a/egs2/how2_2000h/sum1/local/score.sh b/egs2/how2_2000h/sum1/local/score.sh
new file mode 100755
index 00000000000..da549ebcc62
--- /dev/null
+++ b/egs2/how2_2000h/sum1/local/score.sh
@@ -0,0 +1,25 @@
+#!/usr/bin/env bash
+# Copyright 2021 Carnegie Mellon University (Author : Roshan Sharma)
+# Score every <asr-exp-dir>/decode_*/<test set>/text against ${data}/text and write result.sum next to it.
+
+## begin configuration section.
+data=data/dev5_test_sum
+# end configuration section.
+
+[ -f ./path.sh ] && . ./path.sh
+. parse_options.sh || exit 1;
+
+if [ $# -lt 1 ]; then
+  echo "Usage: local/score.sh <asr-exp-dir>"
+  exit 1;
+fi
+
+asr_expdir=$1
+name=$(basename "${data}") # e.g. dev5_test_sum
+
+echo "${asr_expdir}/decode_*/${name}"
+for dir in "${asr_expdir}"/decode_*/"${name}"; do
+    # An unmatched glob stays a literal pattern, so skip anything without a hypothesis file.
+    [ -f "${dir}/text" ] || { echo "Skipping ${dir}: no hypothesis text found" >&2; continue; }
+    python pyscripts/utils/score_summarization.py "${data}/text" "${dir}/text" "$(echo "${dir}" | sed 's/exp//g')" > "${dir}/result.sum"
+done
diff --git a/egs2/how2_2000h/sum1/path.sh b/egs2/how2_2000h/sum1/path.sh
new file mode 120000
index 00000000000..c9ac0a75bc6
--- /dev/null
+++ b/egs2/how2_2000h/sum1/path.sh
@@ -0,0 +1 @@
+../../TEMPLATE/asr1/path.sh
\ No newline at end of file
diff --git a/egs2/how2_2000h/sum1/pyscripts b/egs2/how2_2000h/sum1/pyscripts
new file mode 120000
index 00000000000..ac68ad75b60
--- /dev/null
+++ b/egs2/how2_2000h/sum1/pyscripts
@@ -0,0 +1 @@
+../../TEMPLATE/asr1/pyscripts
\ No newline at end of file
diff --git a/egs2/how2_2000h/sum1/run.sh b/egs2/how2_2000h/sum1/run.sh
new file mode 100755
index 00000000000..5acfc2abc59
--- /dev/null
+++ b/egs2/how2_2000h/sum1/run.sh
@@ -0,0 +1,40 @@
+#!/usr/bin/env bash
+# How2 2000h speech summarization recipe: sets the recipe options and forwards them to asr.sh.
+set -e
+set -u
+set -o pipefail
+
+train_set="tr_2000h_sum"
+valid_set="cv05_sum"
+test_sets="dev5_test_sum"
+asr_config=conf/train_sum_conformer_lf.yaml
+inference_config=conf/decode_sum.yaml
+
+feats_type=extracted
+
+token_type=bpe
+
+nlsyms=data/nlsyms
+nbpe=1000
+bpe_nlsyms="[hes]"  # expand only inside quotes: [hes] is a glob pattern
+
+use_lm=false
+mdur=100
+
+## Run local/run_asr.sh to pretrain an ASR Model on How2, and fine-tune that model on Speech Summarization
+
+./asr.sh \
+    --lang en \
+    --feats_type "${feats_type}" \
+    --token_type "${token_type}" \
+    --nbpe "${nbpe}" \
+    --nlsyms_txt "${nlsyms}" \
+    --bpe_nlsyms "${bpe_nlsyms}" \
+    --use_lm "${use_lm}" \
+    --asr_config "${asr_config}" \
+    --inference_config "${inference_config}" \
+    --train_set "${train_set}" \
+    --valid_set "${valid_set}" \
+    --test_sets "${test_sets}" \
+    --max_wav_duration "$mdur" \
+    --bpe_train_text "data/${train_set}/text" "$@"
diff --git a/egs2/how2_2000h/sum1/scripts b/egs2/how2_2000h/sum1/scripts
new file mode 120000
index 00000000000..b25829705dc
--- /dev/null
+++ b/egs2/how2_2000h/sum1/scripts
@@ -0,0 +1 @@
+../../TEMPLATE/asr1/scripts
\ No newline at end of file
diff --git a/egs2/how2_2000h/sum1/steps b/egs2/how2_2000h/sum1/steps
new file mode 120000
index 00000000000..91f2d234e20
--- /dev/null
+++ b/egs2/how2_2000h/sum1/steps
@@ -0,0 +1 @@
+../../../tools/kaldi/egs/wsj/s5/steps
\ No newline at end of file
diff --git a/egs2/how2_2000h/sum1/utils b/egs2/how2_2000h/sum1/utils
new file mode 120000
index 00000000000..f49247da827
--- /dev/null
+++ b/egs2/how2_2000h/sum1/utils
@@ -0,0 +1 @@
+../../../tools/kaldi/egs/wsj/s5/utils
\ No newline at end of file
diff --git a/espnet/nets/pytorch_backend/transformer/longformer_attention.py b/espnet/nets/pytorch_backend/transformer/longformer_attention.py
new file mode 100644
index 00000000000..82a54c801d1
--- /dev/null
+++ b/espnet/nets/pytorch_backend/transformer/longformer_attention.py
@@ -0,0 +1,56 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+# Copyright 2022 Roshan Sharma (Carnegie Mellon University)
+#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)
+
+"""Longformer based Local Attention Definition."""
+
+from longformer.longformer import LongformerConfig
+from longformer.longformer import LongformerSelfAttention
+from torch import nn
+
+
+class LongformerAttention(nn.Module):
+    """Longformer based Local Attention Definition."""
+
+    def __init__(self, config: LongformerConfig, layer_id: int):
+        """Compute Longformer based Self-Attention.
+
+        Args:
+            config : Longformer attention configuration
+            layer_id: Integer representing the layer index
+        """
+        super().__init__()
+        self.attention_window = config.attention_window[layer_id]
+        self.attention_layer = LongformerSelfAttention(config, layer_id=layer_id)
+        self.attention = None
+
+    def forward(self, query, key, value, mask):
+        """Compute Longformer Self-Attention with masking.
+
+        Expects `len(hidden_states)` to be multiple of `attention_window`.
+        Padding to `attention_window` happens in :meth:`encoder.forward`
+        to avoid redoing the padding on each layer.
+        Args:
+            query (torch.Tensor): Query tensor (#batch, time1, size).
+            key (torch.Tensor): Key tensor (#batch, time2, size).
+            value (torch.Tensor): Value tensor (#batch, time2, size).
+            pos_emb (torch.Tensor): Positional embedding tensor
+                (#batch, 2*time1-1, size).
+            mask (torch.Tensor): Mask tensor (#batch, 1, time2) or
+                (#batch, time1, time2).
+
+        Returns:
+            torch.Tensor: Output tensor (#batch, time1, d_model).
+        """
+        attention_mask = mask.int()
+        attention_mask[mask == 0] = -1
+        attention_mask[mask == 1] = 0
+        output, self.attention = self.attention_layer(
+            hidden_states=query,
+            attention_mask=attention_mask.unsqueeze(1),
+            head_mask=None,
+            output_attentions=True,
+        )
+        return output
diff --git a/espnet2/asr/decoder/transformer_decoder.py b/espnet2/asr/decoder/transformer_decoder.py
index cc6d931a772..1bd74cb76c1 100644
--- a/espnet2/asr/decoder/transformer_decoder.py
+++ b/espnet2/asr/decoder/transformer_decoder.py
@@ -128,6 +128,12 @@ def forward(
         memory_mask = (~make_pad_mask(hlens, maxlen=memory.size(1)))[:, None, :].to(
             memory.device
         )
+        # Padding for Longformer
+        if memory_mask.shape[-1] != memory.shape[1]:
+            padlen = memory.shape[1] - memory_mask.shape[-1]
+            memory_mask = torch.nn.functional.pad(
+                memory_mask, (0, padlen), "constant", False
+            )
 
         x = self.embed(tgt)
         x, tgt_mask, memory, memory_mask = self.decoders(
diff --git a/espnet2/asr/encoder/longformer_encoder.py b/espnet2/asr/encoder/longformer_encoder.py
new file mode 100644
index 00000000000..1d9dcfcc864
--- /dev/null
+++ b/espnet2/asr/encoder/longformer_encoder.py
@@ -0,0 +1,374 @@
+# Copyright 2020 Tomoki Hayashi
+#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)
+
+"""Conformer encoder definition."""
+
+from typing import List
+from typing import Optional
+from typing import Tuple
+
+import torch
+
+from typeguard import check_argument_types
+
+from espnet.nets.pytorch_backend.conformer.convolution import ConvolutionModule
+from espnet.nets.pytorch_backend.conformer.encoder_layer import EncoderLayer
+from espnet.nets.pytorch_backend.nets_utils import get_activation
+from espnet.nets.pytorch_backend.nets_utils import make_pad_mask
+from espnet.nets.pytorch_backend.transformer.embedding import (
+    PositionalEncoding,  # noqa: H301
+)
+from espnet.nets.pytorch_backend.transformer.layer_norm import LayerNorm
+from espnet.nets.pytorch_backend.transformer.multi_layer_conv import Conv1dLinear
+from espnet.nets.pytorch_backend.transformer.multi_layer_conv import MultiLayeredConv1d
+from espnet.nets.pytorch_backend.transformer.positionwise_feed_forward import (
+    PositionwiseFeedForward,  # noqa: H301
+)
+from espnet.nets.pytorch_backend.transformer.repeat import repeat
+from espnet.nets.pytorch_backend.transformer.subsampling import check_short_utt
+from espnet.nets.pytorch_backend.transformer.subsampling import Conv2dSubsampling
+from espnet.nets.pytorch_backend.transformer.subsampling import Conv2dSubsampling2
+from espnet.nets.pytorch_backend.transformer.subsampling import Conv2dSubsampling6
+from espnet.nets.pytorch_backend.transformer.subsampling import Conv2dSubsampling8
+from espnet.nets.pytorch_backend.transformer.subsampling import TooShortUttError
+from espnet2.asr.ctc import CTC
+from espnet2.asr.encoder.conformer_encoder import ConformerEncoder
+
+
+class LongformerEncoder(ConformerEncoder):
+    """Longformer SA Conformer encoder module.
+
+    Args:
+        input_size (int): Input dimension.
+        output_size (int): Dimension of attention.
+        attention_heads (int): The number of heads of multi head attention.
+        linear_units (int): The number of units of position-wise feed forward.
+        num_blocks (int): The number of decoder blocks.
+        dropout_rate (float): Dropout rate.
+        attention_dropout_rate (float): Dropout rate in attention.
+        positional_dropout_rate (float): Dropout rate after adding positional encoding.
+        input_layer (Union[str, torch.nn.Module]): Input layer type.
+        normalize_before (bool): Whether to use layer_norm before the first block.
+        concat_after (bool): Whether to concat attention layer's input and output.
+            If True, additional linear will be applied.
+            i.e. x -> x + linear(concat(x, att(x)))
+            If False, no additional linear will be applied. i.e. x -> x + att(x)
+        positionwise_layer_type (str): "linear", "conv1d", or "conv1d-linear".
+        positionwise_conv_kernel_size (int): Kernel size of positionwise conv1d layer.
+        rel_pos_type (str): Whether to use the latest relative positional encoding or
+            the legacy one. The legacy relative positional encoding will be deprecated
+            in the future. More Details can be found in
+            https://github.com/espnet/espnet/pull/2816.
+        encoder_pos_enc_layer_type (str): Encoder positional encoding layer type.
+        encoder_attn_layer_type (str): Encoder attention layer type.
+        activation_type (str): Encoder activation function type.
+        macaron_style (bool): Whether to use macaron style for positionwise layer.
+        use_cnn_module (bool): Whether to use convolution module.
+        zero_triu (bool): Whether to zero the upper triangular part of attention matrix.
+        cnn_module_kernel (int): Kernerl size of convolution module.
+        padding_idx (int): Padding idx for input_layer=embed.
+        attention_windows (list): Layer-wise attention window sizes
+            for longformer self-attn
+        attention_dilation(list): Layer-wise attention dilation sizes
+            for longformer self-attn
+        attention_mode(str): Implementation for longformer self-attn.
+            Default="sliding_chunks"
+            Choose 'n2', 'tvm' or 'sliding_chunks'. More details in
+            https://github.com/allenai/longformer
+
+    """
+
+    def __init__(
+        self,
+        input_size: int,
+        output_size: int = 256,
+        attention_heads: int = 4,
+        linear_units: int = 2048,
+        num_blocks: int = 6,
+        dropout_rate: float = 0.1,
+        positional_dropout_rate: float = 0.1,
+        attention_dropout_rate: float = 0.0,
+        input_layer: str = "conv2d",
+        normalize_before: bool = True,
+        concat_after: bool = False,
+        positionwise_layer_type: str = "linear",
+        positionwise_conv_kernel_size: int = 3,
+        macaron_style: bool = False,
+        rel_pos_type: str = "legacy",
+        pos_enc_layer_type: str = "abs_pos",
+        selfattention_layer_type: str = "lf_selfattn",
+        activation_type: str = "swish",
+        use_cnn_module: bool = True,
+        zero_triu: bool = False,
+        cnn_module_kernel: int = 31,
+        padding_idx: int = -1,
+        interctc_layer_idx: List[int] = [],
+        interctc_use_conditioning: bool = False,
+        attention_windows: list = [100, 100, 100, 100, 100, 100],
+        attention_dilation: list = [1, 1, 1, 1, 1, 1],
+        attention_mode: str = "sliding_chunks",
+    ):
+        assert check_argument_types()
+        super().__init__(input_size)
+        self._output_size = output_size
+
+        activation = get_activation(activation_type)
+
+        if pos_enc_layer_type == "abs_pos":
+            pos_enc_class = PositionalEncoding
+        else:
+            raise ValueError(
+                "incorrect or unknown pos_enc_layer: "
+                + pos_enc_layer_type
+                + "Use abs_pos"
+            )
+
+        if len(attention_dilation) != num_blocks:
+            raise ValueError(
+                "incorrect attention_dilation parameter of length"
+                + str(len(attention_dilation))
+                + " does not match num_blocks"
+                + str(num_blocks)
+            )
+
+        if len(attention_windows) != num_blocks:
+            raise ValueError(
+                "incorrect attention_windows parameter of length"
+                + str(len(attention_windows))
+                + " does not match num_blocks"
+                + str(num_blocks)
+            )
+
+        if attention_mode != "tvm" and max(attention_dilation) != 1:
+            raise ValueError(
+                "incorrect attention mode for dilation: "
+                + attention_mode
+                + "Use attention_mode=tvm with Cuda Kernel"
+            )
+
+        if input_layer == "linear":
+            self.embed = torch.nn.Sequential(
+                torch.nn.Linear(input_size, output_size),
+                torch.nn.LayerNorm(output_size),
+                torch.nn.Dropout(dropout_rate),
+                pos_enc_class(output_size, positional_dropout_rate),
+            )
+        elif input_layer == "conv2d":
+            self.embed = Conv2dSubsampling(
+                input_size,
+                output_size,
+                dropout_rate,
+                pos_enc_class(output_size, positional_dropout_rate),
+            )
+        elif input_layer == "conv2d2":
+            self.embed = Conv2dSubsampling2(
+                input_size,
+                output_size,
+                dropout_rate,
+                pos_enc_class(output_size, positional_dropout_rate),
+            )
+        elif input_layer == "conv2d6":
+            self.embed = Conv2dSubsampling6(
+                input_size,
+                output_size,
+                dropout_rate,
+                pos_enc_class(output_size, positional_dropout_rate),
+            )
+        elif input_layer == "conv2d8":
+            self.embed = Conv2dSubsampling8(
+                input_size,
+                output_size,
+                dropout_rate,
+                pos_enc_class(output_size, positional_dropout_rate),
+            )
+        elif input_layer == "embed":
+            self.embed = torch.nn.Sequential(
+                torch.nn.Embedding(input_size, output_size, padding_idx=padding_idx),
+                pos_enc_class(output_size, positional_dropout_rate),
+            )
+        elif isinstance(input_layer, torch.nn.Module):
+            self.embed = torch.nn.Sequential(
+                input_layer,
+                pos_enc_class(output_size, positional_dropout_rate),
+            )
+        elif input_layer is None:
+            self.embed = torch.nn.Sequential(
+                pos_enc_class(output_size, positional_dropout_rate)
+            )
+        else:
+            raise ValueError("unknown input_layer: " + input_layer)
+
+        self.normalize_before = normalize_before
+        if positionwise_layer_type == "linear":
+            positionwise_layer = PositionwiseFeedForward
+            positionwise_layer_args = (
+                output_size,
+                linear_units,
+                dropout_rate,
+                activation,
+            )
+        elif positionwise_layer_type == "conv1d":
+            positionwise_layer = MultiLayeredConv1d
+            positionwise_layer_args = (
+                output_size,
+                linear_units,
+                positionwise_conv_kernel_size,
+                dropout_rate,
+            )
+        elif positionwise_layer_type == "conv1d-linear":
+            positionwise_layer = Conv1dLinear
+            positionwise_layer_args = (
+                output_size,
+                linear_units,
+                positionwise_conv_kernel_size,
+                dropout_rate,
+            )
+        else:
+            raise NotImplementedError("Support only linear or conv1d.")
+        self.selfattention_layer_type = selfattention_layer_type
+        if selfattention_layer_type == "lf_selfattn":
+            assert pos_enc_layer_type == "abs_pos"
+            from espnet.nets.pytorch_backend.transformer.longformer_attention import (
+                LongformerAttention,  # noqa: H301
+            )
+            from longformer.longformer import LongformerConfig
+
+            encoder_selfattn_layer = LongformerAttention
+
+            config = LongformerConfig(
+                attention_window=attention_windows,
+                attention_dilation=attention_dilation,
+                autoregressive=False,
+                num_attention_heads=attention_heads,
+                hidden_size=output_size,
+                attention_probs_dropout_prob=dropout_rate,
+                attention_mode=attention_mode,
+            )
+            encoder_selfattn_layer_args = (config,)
+        else:
+            raise ValueError(
+                "incompatible or unknown encoder_attn_layer: "
+                + selfattention_layer_type
+                + " Use lf_selfattn"
+            )
+
+        convolution_layer = ConvolutionModule
+        convolution_layer_args = (output_size, cnn_module_kernel, activation)
+
+        self.encoders = repeat(
+            num_blocks,
+            lambda layer_id: EncoderLayer(
+                output_size,
+                encoder_selfattn_layer(*(encoder_selfattn_layer_args + (layer_id,))),
+                positionwise_layer(*positionwise_layer_args),
+                positionwise_layer(*positionwise_layer_args) if macaron_style else None,
+                convolution_layer(*convolution_layer_args) if use_cnn_module else None,
+                dropout_rate,
+                normalize_before,
+                concat_after,
+            ),
+        )
+
+        if self.normalize_before:
+            self.after_norm = LayerNorm(output_size)
+
+        self.interctc_layer_idx = interctc_layer_idx
+        if len(interctc_layer_idx) > 0:
+            assert 0 < min(interctc_layer_idx) and max(interctc_layer_idx) < num_blocks
+        self.interctc_use_conditioning = interctc_use_conditioning
+        self.conditioning_layer = None
+
+    def output_size(self) -> int:
+        return self._output_size
+
+    def forward(
+        self,
+        xs_pad: torch.Tensor,
+        ilens: torch.Tensor,
+        prev_states: torch.Tensor = None,
+        ctc: CTC = None,
+    ) -> Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
+        """Calculate forward propagation.
+
+        Args:
+            xs_pad (torch.Tensor): Input tensor (#batch, L, input_size).
+            ilens (torch.Tensor): Input length (#batch).
+            prev_states (torch.Tensor): Not to be used now.
+
+        Returns:
+            torch.Tensor: Output tensor (#batch, L, output_size).
+            torch.Tensor: Output length (#batch).
+            torch.Tensor: Not to be used now.
+
+        """
+
+        masks = (~make_pad_mask(ilens)[:, None, :]).to(xs_pad.device)
+        if (
+            isinstance(self.embed, Conv2dSubsampling)
+            or isinstance(self.embed, Conv2dSubsampling2)
+            or isinstance(self.embed, Conv2dSubsampling6)
+            or isinstance(self.embed, Conv2dSubsampling8)
+        ):
+            short_status, limit_size = check_short_utt(self.embed, xs_pad.size(1))
+            if short_status:
+                raise TooShortUttError(
+                    f"has {xs_pad.size(1)} frames and is too short for subsampling "
+                    + f"(it needs more than {limit_size} frames), return empty results",
+                    xs_pad.size(1),
+                    limit_size,
+                )
+            xs_pad, masks = self.embed(xs_pad, masks)
+        else:
+            xs_pad = self.embed(xs_pad)
+
+        if self.selfattention_layer_type == "lf_selfattn":
+            seq_len = xs_pad.shape[1]
+            attention_window = (
+                max([x.self_attn.attention_window for x in self.encoders]) * 2
+            )
+            padding_len = (
+                attention_window - seq_len % attention_window
+            ) % attention_window
+            xs_pad = torch.nn.functional.pad(
+                xs_pad, (0, 0, 0, padding_len), "constant", 0
+            )
+            masks = torch.nn.functional.pad(masks, (0, padding_len), "constant", False)
+
+        xs_pad, masks = self.encoders(xs_pad, masks)
+        intermediate_outs = []
+        if len(self.interctc_layer_idx) == 0:
+            xs_pad, masks = self.encoders(xs_pad, masks)
+        else:
+            for layer_idx, encoder_layer in enumerate(self.encoders):
+                xs_pad, masks = encoder_layer(xs_pad, masks)
+
+                if layer_idx + 1 in self.interctc_layer_idx:
+                    encoder_out = xs_pad
+                    if isinstance(encoder_out, tuple):
+                        encoder_out = encoder_out[0]
+
+                    # intermediate outputs are also normalized
+                    if self.normalize_before:
+                        encoder_out = self.after_norm(encoder_out)
+
+                    intermediate_outs.append((layer_idx + 1, encoder_out))
+
+                    if self.interctc_use_conditioning:
+                        ctc_out = ctc.softmax(encoder_out)
+
+                        if isinstance(xs_pad, tuple):
+                            x, pos_emb = xs_pad
+                            x = x + self.conditioning_layer(ctc_out)
+                            xs_pad = (x, pos_emb)
+                        else:
+                            xs_pad = xs_pad + self.conditioning_layer(ctc_out)
+
+        if isinstance(xs_pad, tuple):
+            xs_pad = xs_pad[0]
+        if self.normalize_before:
+            xs_pad = self.after_norm(xs_pad)
+
+        olens = masks.squeeze(1).sum(1)
+        if len(intermediate_outs) > 0:
+            return (xs_pad, intermediate_outs), olens, None
+        return xs_pad, olens, None
diff --git a/espnet2/tasks/asr.py b/espnet2/tasks/asr.py
index 780aa905697..750c726d77b 100644
--- a/espnet2/tasks/asr.py
+++ b/espnet2/tasks/asr.py
@@ -28,6 +28,8 @@
 from espnet2.asr.decoder.transformer_decoder import TransformerDecoder
 from espnet2.asr.encoder.abs_encoder import AbsEncoder
 from espnet2.asr.encoder.conformer_encoder import ConformerEncoder
+from espnet2.asr.encoder.longformer_encoder import LongformerEncoder
+
 from espnet2.asr.encoder.hubert_encoder import FairseqHubertEncoder
 from espnet2.asr.encoder.hubert_encoder import FairseqHubertPretrainEncoder
 from espnet2.asr.encoder.rnn_encoder import RNNEncoder
@@ -126,6 +128,7 @@
         wav2vec2=FairSeqWav2Vec2Encoder,
         hubert=FairseqHubertEncoder,
         hubert_pretrain=FairseqHubertPretrainEncoder,
+        longformer=LongformerEncoder,
     ),
     type_check=AbsEncoder,
     default="rnn",
diff --git a/test/espnet2/asr/encoder/test_longformer_encoder.py b/test/espnet2/asr/encoder/test_longformer_encoder.py
new file mode 100644
index 00000000000..8df5f5fc212
--- /dev/null
+++ b/test/espnet2/asr/encoder/test_longformer_encoder.py
@@ -0,0 +1,83 @@
+from espnet2.asr.encoder.longformer_encoder import LongformerEncoder
+import pytest
+import torch
+
+pytest.importorskip("longformer")
+
+
+@pytest.mark.parametrize(
+    "input_layer", ["linear", "conv2d", "conv2d2", "conv2d6", "conv2d8", "embed"]
+)
+@pytest.mark.parametrize("positionwise_layer_type", ["conv1d", "conv1d-linear"])
+@pytest.mark.parametrize(
+    "rel_pos_type, pos_enc_layer_type, selfattention_layer_type",
+    [("legacy", "abs_pos", "lf_selfattn")],
+)
+def test_encoder_forward_backward(
+    input_layer,
+    positionwise_layer_type,
+    rel_pos_type,
+    pos_enc_layer_type,
+    selfattention_layer_type,
+):
+    """Run one forward and one backward pass for each parametrized setup."""
+    pytest.importorskip("longformer")
+    model = LongformerEncoder(
+        20,
+        output_size=2,
+        attention_heads=2,
+        linear_units=4,
+        num_blocks=2,
+        input_layer=input_layer,
+        positionwise_layer_type=positionwise_layer_type,
+        rel_pos_type=rel_pos_type,
+        pos_enc_layer_type=pos_enc_layer_type,
+        selfattention_layer_type=selfattention_layer_type,
+        macaron_style=False,
+        activation_type="swish",
+        use_cnn_module=True,
+        cnn_module_kernel=3,
+        attention_windows=[10, 10],
+        attention_dilation=[1, 1],
+        attention_mode="sliding_chunks",
+    )
+    # "embed" is fed integer ids; every other input layer gets float features.
+    if input_layer == "embed":
+        feats = torch.randint(0, 10, [2, 32])
+    else:
+        feats = torch.randn(2, 32, 20, requires_grad=True)
+    feat_lens = torch.LongTensor([32, 28])
+    out, _, _ = model(feats, feat_lens)
+    out.sum().backward()
+
+
+def test_encoder_invalid_layer_type():
+    """Each invalid layer-type configuration below must raise ValueError."""
+    pytest.importorskip("longformer")
+    for bad_kwargs in (
+        dict(pos_enc_layer_type="abc_pos"),
+        dict(pos_enc_layer_type="dummy"),
+        dict(pos_enc_layer_type="abc_pos", selfattention_layer_type="dummy"),
+    ):
+        with pytest.raises(ValueError):
+            LongformerEncoder(20, **bad_kwargs)
+
+
+def test_encoder_invalid_windows_parameter():
+    """Two-element window/dilation lists with num_blocks=4 must raise ValueError."""
+    pytest.importorskip("longformer")
+    for list_arg in ("attention_windows", "attention_dilation"):
+        with pytest.raises(ValueError):
+            LongformerEncoder(20, num_blocks=4, **{list_arg: [1, 1]})
+
+
+def test_encoder_output_size():
+    """output_size() must return the output_size given to the constructor."""
+    pytest.importorskip("longformer")
+    assert LongformerEncoder(20, output_size=256).output_size() == 256
+
+
+def test_encoder_invalid_type():
+    """Constructing the encoder with input_layer="fff" must raise ValueError."""
+    pytest.importorskip("longformer")
+    pytest.raises(ValueError, LongformerEncoder, 20, input_layer="fff")
diff --git a/tools/Makefile b/tools/Makefile
index 744a58b1bf7..c8c41bbb524 100644
--- a/tools/Makefile
+++ b/tools/Makefile
@@ -30,10 +30,10 @@ all: kaldi showenv python conda_packages.done sctk.done sph2pipe.done check_inst
 
 ifneq ($(strip $(CHAINER_VERSION)),)
 python: activate_python.sh espnet.done pytorch.done chainer.done fairscale.done torch_optimizer.done
-extra: warp-ctc.done warp-transducer.done chainer_ctc.done nkf.done moses.done mwerSegmenter.done pesq kenlm.done pyopenjtalk.done py3mmseg.done beamformit.done fairseq.done s3prl.done k2.done transformers.done phonemizer.done
+extra: warp-ctc.done warp-transducer.done chainer_ctc.done nkf.done moses.done mwerSegmenter.done pesq kenlm.done pyopenjtalk.done py3mmseg.done beamformit.done fairseq.done s3prl.done k2.done transformers.done phonemizer.done longformer.done
 else
 python: activate_python.sh espnet.done pytorch.done fairscale.done torch_optimizer.done
-extra: warp-ctc.done warp-transducer.done nkf.done moses.done mwerSegmenter.done pesq kenlm.done pyopenjtalk.done py3mmseg.done beamformit.done fairseq.done s3prl.done k2.done transformers.done phonemizer.done
+extra: warp-ctc.done warp-transducer.done nkf.done moses.done mwerSegmenter.done pesq kenlm.done pyopenjtalk.done py3mmseg.done beamformit.done fairseq.done s3prl.done k2.done transformers.done phonemizer.done longformer.done
 endif
 
 
@@ -205,6 +205,10 @@ transformers.done: espnet.done
 	. ./activate_python.sh && ./installers/install_transformers.sh
 	touch transformers.done
 
+longformer.done: espnet.done  # stamp file, touched once install_longformer.sh has run successfully
+	. ./activate_python.sh && ./installers/install_longformer.sh
+	touch longformer.done
+
 check_install: python
 	. ./activate_python.sh; . ./extra_path.sh; python3 check_install.py
 
diff --git a/tools/check_install.py b/tools/check_install.py
index c5e4e6aa877..82081986123 100644
--- a/tools/check_install.py
+++ b/tools/check_install.py
@@ -29,6 +29,9 @@
     ("transformers", None, "installers/install_transformers.sh"),
     ("speechbrain", None, "installers/install_speechbrain.sh"),
     ("k2", None, "installers/install_k2.sh"),
+    ("longformer", None, "installers/install_longformer.sh"),
+    ("nlgeval", None, "installers/install_longformer.sh"),  # module of nlg-eval
+    ("datasets", None, "installers/install_longformer.sh"),
 ]
 
 executable_list = [
diff --git a/tools/installers/install_longformer.sh b/tools/installers/install_longformer.sh
new file mode 100755
index 00000000000..a2f11eecf15
--- /dev/null
+++ b/tools/installers/install_longformer.sh
@@ -0,0 +1,71 @@
+#!/usr/bin/env bash
+#
+# Install the optional Longformer package plus the summarization-scoring
+# packages (datasets, bert-score, nlg-eval) with the pip of the active python3.
+#
+# Usage: install_longformer.sh   (takes no arguments)
+#
+# python3 must already be able to import torch. When torch is older than 1.6.1
+# nothing is installed: a warning is printed and the script still exits 0.
+
+set -euo pipefail
+
+if [ $# != 0 ]; then
+    echo "Usage: $0"
+    exit 1;
+fi
+
+torch_version=$(python3 -c "import torch; print(torch.__version__)")
+
+# python_36_plus: succeeds when python3 is version 3.6 or newer.
+# sys.version_info is compared directly instead of going through
+# distutils.version, which no longer exists in Python 3.12 and later.
+python_36_plus(){
+    python3 -c 'import sys; sys.exit(0 if sys.version_info >= (3, 6) else 1)'
+}
+
+# pt_plus VERSION: succeeds when the installed torch is at least VERSION.
+# Only the leading dotted integers are compared, so local build suffixes such
+# as "+cu113" are ignored. Both versions are passed as arguments rather than
+# being interpolated into the Python source.
+pt_plus(){
+    python3 - "${torch_version}" "$1" <<'EOF'
+import re
+import sys
+
+
+def release(version):
+    """Return the leading dotted-integer part of ``version`` as a tuple of ints."""
+    match = re.match(r"\d+(?:\.\d+)*", version)
+    if match is None:
+        return ()
+    return tuple(int(part) for part in match.group(0).split("."))
+
+
+sys.exit(0 if release(sys.argv[1]) >= release(sys.argv[2]) else 1)
+EOF
+}
+
+echo "[INFO] torch_version=${torch_version}"
+
+if ! python_36_plus; then
+    echo "[ERROR] python<3.6 is not supported"
+    exit 1
+fi
+
+if pt_plus 1.6.1; then
+    # "python3 -m pip" installs into the same interpreter whose torch version
+    # was inspected above, even if a different "pip" comes first on PATH.
+    python3 -m pip install git+https://github.com/roshansh-cmu/longformer.git
+    python3 -m pip install datasets bert-score
+    python3 -m pip install git+https://github.com/Maluuba/nlg-eval.git@master
+else
+    echo "[WARNING] Longformer requires pytorch>=1.6.1"
+fi
+
+# Check the pytorch version is not changed from the original version
+current_torch_version="$(python3 -c 'import torch; print(torch.__version__)')"
+if [ "${torch_version}" != "${current_torch_version}" ]; then
+    echo "[ERROR] The torch version has been changed. Please report to espnet administrators"
+    exit 1
+fi