Merge branch 'espnet:master' into master

chintu619 · Mar 8, 2022 · 597cd7b · 597cd7b
2 parents 6625f90 + f16e579
commit 597cd7b
Show file tree

Hide file tree

Showing 72 changed files with 1,524 additions and 114 deletions.
diff --git a/ci/install.sh b/ci/install.sh
@@ -21,7 +21,7 @@ ${CXX:-g++} -v
     . ./activate_python.sh
     make TH_VERSION="${TH_VERSION}"
 
-    make warp-ctc.done warp-transducer.done chainer_ctc.done nkf.done moses.done mwerSegmenter.done pesq pyopenjtalk.done py3mmseg.done s3prl.done transformers.done phonemizer.done fairseq.done k2.done gtn.done
+    make warp-ctc.done warp-transducer.done chainer_ctc.done nkf.done moses.done mwerSegmenter.done pesq pyopenjtalk.done py3mmseg.done s3prl.done transformers.done phonemizer.done fairseq.done k2.done gtn.done longformer.done
     rm -rf kaldi
 )
 . tools/activate_python.sh

diff --git a/egs2/TEMPLATE/asr1/pyscripts/utils/score_summarization.py b/egs2/TEMPLATE/asr1/pyscripts/utils/score_summarization.py
@@ -0,0 +1,50 @@
+import sys
+import os
+from datasets import load_metric
+import numpy as np
+from nlgeval import compute_metrics
+from nlgeval import NLGEval
+
+
+def read_text(path):
+    """Load a "<key> <text>" file into a dict mapping key to text.
+
+    Each line is stripped and split at its first space. A line that has no
+    space maps its whole content to an empty string. If a key occurs more
+    than once, the last occurrence wins.
+    """
+    with open(path, "r", encoding="utf-8") as f:
+        return {
+            key: text
+            for key, _, text in (line.strip().partition(" ") for line in f)
+        }
+
+
+# Usage: score_summarization.py <ref_file> <hyp_file>
+ref_file = sys.argv[1]
+hyp_file = sys.argv[2]
+
+ref_dict = read_text(ref_file)
+hyp_dict = read_text(hyp_file)
+
+# Score exactly the keys found in the hypothesis file, in file order.
+# A key that is missing from the reference file raises KeyError.
+keys = list(hyp_dict)
+labels = [ref_dict[k] for k in keys]
+decoded_preds = [hyp_dict[k] for k in keys]
+
+metric = load_metric("bertscore")
+result_bert = metric.compute(
+    predictions=decoded_preds,
+    references=labels,
+    lang="en",
+)
+
+
+nlg = NLGEval()  # loads the models
+# Per-key METEOR and ROUGE-L, one line per hypothesis.
+print("Key", "\t", "METEOR", "\t", "ROUGE-L")
+for key, ref, hyp in zip(keys, labels, decoded_preds):
+    metrics_dict = nlg.compute_individual_metrics([ref], hyp)
+    print(key, "\t", metrics_dict["METEOR"], "\t", metrics_dict["ROUGE_L"])
+
+# Corpus-level scores over all hypotheses.
+metrics_dict = nlg.compute_metrics(ref_list=[labels], hyp_list=decoded_preds)
+metric = load_metric("rouge")
+result = metric.compute(predictions=decoded_preds, references=labels)
+result = {key: value.mid.fmeasure * 100 for key, value in result.items()}
+
+meteor = metrics_dict["METEOR"] * 100.0
+# NOTE: the last field is the mean BERTScore *precision*, not F1.
+bert_precision = 100 * np.mean(result_bert["precision"])
+
+# Fields must be separated by exactly one space: show_asr_result.sh turns
+# every space of this line into a "|" table separator, so a backslash
+# continuation inside the string literal (which embeds the next line's
+# indentation) would create empty table columns.
+print(
+    f"RESULT {result['rouge1']} {result['rouge2']} {result['rougeL']} "
+    f"{meteor} {bert_precision}"
+)
diff --git a/egs2/TEMPLATE/asr1/scripts/utils/show_asr_result.sh b/egs2/TEMPLATE/asr1/scripts/utils/show_asr_result.sh
@@ -44,7 +44,16 @@ cat << EOF
 EOF
 
 while IFS= read -r expdir; do
-    if ls "${expdir}"/*/*/score_*/result.txt &> /dev/null; then
+
+      if ls "${expdir}"/*/*/result.sum &> /dev/null; then
+	echo "## $(basename ${expdir})"
+	cat << EOF
+|dataset|ROUGE-1|ROUGE-2|ROUGE-L|METEOR|BERTScore|
+|---|---|---|---|---|---|
+EOF
+	grep -H -e "RESULT" "${expdir}"/*/*/result.sum | sed 's=RESULT==g' |  cut -d ' ' -f 1,2- | tr ' ' '|'
+	echo  
+      elif ls "${expdir}"/*/*/score_*/result.txt &> /dev/null; then
         echo "## $(basename ${expdir})"
         for type in wer cer ter; do
                 	cat << EOF

diff --git a/egs2/how2/asr1/cmd.sh b/egs2/how2/asr1/cmd.sh
diff --git a/egs2/how2/asr1/cmd.sh b/egs2/how2/asr1/cmd.sh
@@ -0,0 +1 @@
+../../TEMPLATE/asr1/cmd.sh
diff --git a/egs2/how2_2000h/asr1/README.md b/egs2/how2_2000h/asr1/README.md
@@ -0,0 +1,30 @@
+## End to End Speech Recognition
+
+This recipe can be used to build E2E Speech Summarization models using restricted self-attention on the HowTo corpus of instructional videos. 
+
+HowTo 2000h fbank-pitch features have been released to enable reproduction of this recipe. 
+
+# Results on ASR
+
+
+## asr_base_conformer_lf_mix
+### WER
+
+|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err|
+|---|---|---|---|---|---|---|---|---|
+|decode_asr_model_valid.acc.best/dev5_test|3016|55215|93.1|4.8|2.1|1.9|8.8|56.7|
+|decode_asr_model_valid.acc.best/held_out_test|2761|47348|92.7|5.0|2.3|2.2|9.5|54.6|
+
+### CER
+
+|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err|
+|---|---|---|---|---|---|---|---|---|
+|decode_asr_model_valid.acc.best/dev5_test|3016|276377|97.1|1.1|1.9|1.9|4.8|56.7|
+|decode_asr_model_valid.acc.best/held_out_test|2761|236575|96.8|1.2|2.0|2.1|5.4|54.6|
+
+### TER
+
+|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err|
+|---|---|---|---|---|---|---|---|---|
+|decode_asr_model_valid.acc.best/dev5_test|3016|82484|94.1|3.5|2.4|2.2|8.0|56.7|
+|decode_asr_model_valid.acc.best/held_out_test|2761|70264|93.9|3.7|2.4|2.7|8.9|54.6|
diff --git a/egs2/how2_2000h/asr1/asr.sh b/egs2/how2_2000h/asr1/asr.sh
@@ -0,0 +1 @@
+../../TEMPLATE/asr1/asr.sh
diff --git a/egs2/how2_2000h/asr1/cmd.sh b/egs2/how2_2000h/asr1/cmd.sh
@@ -0,0 +1 @@
+../../TEMPLATE/asr1/cmd.sh
diff --git a/egs2/how2_2000h/asr1/conf/decode_asr.yaml b/egs2/how2_2000h/asr1/conf/decode_asr.yaml
@@ -0,0 +1 @@
+tuning/decode_ctc.yaml
diff --git a/egs2/how2_2000h/asr1/conf/fbank.conf b/egs2/how2_2000h/asr1/conf/fbank.conf
@@ -0,0 +1,2 @@
+--sample-frequency=16000 
+--num-mel-bins=80
diff --git a/egs2/how2_2000h/asr1/conf/pbs.conf b/egs2/how2_2000h/asr1/conf/pbs.conf
@@ -0,0 +1,11 @@
+# Default configuration
+command qsub -V -v PATH -S /bin/bash
+option name=* -N $0
+option mem=* -l mem=$0
+option mem=0          # Do not add anything to qsub_opts
+option num_threads=* -l ncpus=$0
+option num_threads=1  # Do not add anything to qsub_opts
+option num_nodes=* -l nodes=$0:ppn=1
+default gpu=0
+option gpu=0
+option gpu=* -l ngpus=$0
diff --git a/egs2/how2_2000h/asr1/conf/pitch.conf b/egs2/how2_2000h/asr1/conf/pitch.conf
@@ -0,0 +1 @@
+--sample-frequency=16000
diff --git a/egs2/how2_2000h/asr1/conf/queue.conf b/egs2/how2_2000h/asr1/conf/queue.conf
@@ -0,0 +1,12 @@
+# Default configuration
+command qsub -v PATH -cwd -S /bin/bash -j y -l arch=*64*
+option name=* -N $0
+option mem=* -l mem_free=$0,ram_free=$0
+option mem=0          # Do not add anything to qsub_opts
+option num_threads=* -pe smp $0
+option num_threads=1  # Do not add anything to qsub_opts
+option max_jobs_run=* -tc $0
+option num_nodes=* -pe mpi $0  # You must set this PE as allocation_rule=1
+default gpu=0
+option gpu=0
+option gpu=* -l gpu=$0 -q g.q
diff --git a/egs2/how2_2000h/asr1/conf/slurm.conf b/egs2/how2_2000h/asr1/conf/slurm.conf
@@ -0,0 +1,14 @@
+# Default configuration
+command sbatch --export=PATH
+option name=* --job-name $0
+option time=* --time $0
+option mem=* --mem-per-cpu $0
+option mem=0
+option num_threads=* --cpus-per-task $0
+option num_threads=1 --cpus-per-task 1
+option num_nodes=* --nodes $0
+default gpu=0
+option gpu=0 -p cpu
+option gpu=* -p gpu --gres=gpu:$0 -c $0  # Recommend allocating at least as many CPUs as GPUs
+# note: the --max-jobs-run option is supported as a special case
+# by slurm.pl and you don't have to handle it in the config file.
diff --git a/egs2/how2_2000h/asr1/conf/train_asr_conformer_lf.yaml b/egs2/how2_2000h/asr1/conf/train_asr_conformer_lf.yaml
@@ -0,0 +1 @@
+tuning/train_asr_conformer_vid_ctc_lf.yaml
diff --git a/egs2/how2_2000h/asr1/conf/tuning/decode.yaml b/egs2/how2_2000h/asr1/conf/tuning/decode.yaml
@@ -0,0 +1,7 @@
+beam_size: 4
+batch_size: 1
+penalty: 0.0
+minlenratio: 0.0
+maxlenratio: 0.0
+ctc_weight: 0.3
+lm_weight: 0.0
diff --git a/egs2/how2_2000h/asr1/conf/tuning/decode_ctc.yaml b/egs2/how2_2000h/asr1/conf/tuning/decode_ctc.yaml
@@ -0,0 +1,7 @@
+beam_size: 20
+batch_size: 1
+penalty: 0.1
+minlenratio: 0.0
+maxlenratio: 0.0
+ctc_weight: 1.0
+lm_weight: 0.0
diff --git a/egs2/how2_2000h/asr1/conf/tuning/pretrain_hubert.yaml b/egs2/how2_2000h/asr1/conf/tuning/pretrain_hubert.yaml
@@ -0,0 +1,58 @@
+grad_clip: 5.0
+batch_type: numel
+batch_bins: 150000000
+accum_grad: 1
+max_epoch: 400
+patience: none
+# Use self-defined function for initialization
+init: xavier_uniform 
+best_model_criterion:
+-   - valid
+    - acc
+    - max
+keep_nbest_models: 10
+
+input_size: 768
+encoder: avhubert_pretrain
+encoder_conf:
+    output_size: 768
+    linear_units: 3072
+    attention_heads: 8
+    num_blocks: 12
+    dropout_rate: 0.1
+    attention_dropout_rate: 0.0
+    dropout_input: 0.1
+    dropout_features: 0.1
+    skip_masked: false
+    skip_nomask: false
+    mask_prob: 0.80
+    extractor_mode: default
+    conv_feature_layers: '[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2'
+    final_dim: 256
+    encoder_layerdrop: 0.05
+    feature_grad_mult: 0.1
+    untie_final_proj: true
+    label_rate: 100
+    sample_rate: 16000
+
+model_conf:
+    lsm_weight: 0.1
+    length_normalized_loss: false
+    pred_masked_weight: 1.0
+    pred_nomask_weight: 0.0
+    loss_weights: 10.0
+
+optim: adam
+optim_conf:
+    lr: 0.0005
+scheduler: warmuplr
+scheduler_conf:
+    warmup_steps: 25000
+
+unused_parameters: true
+
+frontend: null
+
+normalize: null
+
+specaug: null