Merge pull request espnet#3063 from sw005320/iwslt21_asr
added results and uploaded models
sw005320 authored Mar 10, 2021
2 parents 1bdb752 + 03ae696 commit a3819a5
Showing 6 changed files with 133 additions and 10 deletions.
60 changes: 60 additions & 0 deletions egs/iwslt21/asr1/RESULTS.md
@@ -0,0 +1,60 @@
# RESULTS
## Environments
- date: `Tue Mar 9 09:50:14 EST 2021`
- python version: `3.8.5 (default, Sep 4 2020, 07:30:14) [GCC 7.3.0]`
- espnet version: `espnet 0.9.8`
- chainer version: `chainer 6.0.0`
- pytorch version: `pytorch 1.7.1`
- Git hash: `99d89903e42013dda5c5bc08bcf37a529eab7eb7`
- Commit date: `Tue Mar 9 08:58:35 2021 -0500`

## train_pytorch_train_pytorch_conformer_large_mustc_like_bpe5000_specaug
- Model files (archived to model.mustc_like.tar.gz by `$ pack_model.sh`)
- model link: https://drive.google.com/file/d/107ujDaIrlj6tFHiWLNP6aUBuV0PVyX_Y/view?usp=sharing
- training config file: `conf/tuning/train_pytorch_conformer_large_mustc_like.yaml`
- decoding config file: `conf/tuning/decode_pytorch_transformer.yaml`
- cmvn file: `data/train/cmvn.ark`
- e2e file: `exp/train_pytorch_train_pytorch_conformer_large_mustc_like_bpe5000_specaug/results/model.val5.avg.best`
- e2e JSON file: `exp/train_pytorch_train_pytorch_conformer_large_mustc_like_bpe5000_specaug/results/model.json`
- dict file: `data/lang_1spm`
- No LM; 4-GPU training
### CER

|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err|
|---|---|---|---|---|---|---|---|---|
|decode_et_librispeech_test_other_decode|2939|71179|92.4|5.7|1.8|1.1|8.7|56.5|
|decode_et_mustc_tst-COMMON_decode|2641|58047|94.7|2.8|2.6|1.1|6.4|36.6|
|decode_et_tedlium2_test_decode|1155|33696|94.1|2.7|3.2|1.2|7.2|56.4|

### WER

|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err|
|---|---|---|---|---|---|---|---|---|
|decode_et_librispeech_test_other_decode|2939|53022|93.3|6.0|0.7|0.8|7.5|56.4|
|decode_et_mustc_tst-COMMON_decode|2641|47335|95.2|2.9|1.8|1.1|5.8|36.6|
|decode_et_tedlium2_test_decode|1155|27500|94.0|3.0|3.0|1.2|7.2|56.3|
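
As a guide to reading these sclite-style tables: Corr, Sub, and Del are percentages of reference words, Ins is insertions relative to the same reference count, Err = Sub + Del + Ins, and S.Err is the sentence error rate. A quick check of that relationship against the librispeech test_other WER row above:

```python
# Values from the WER row for decode_et_librispeech_test_other_decode above.
corr, sub, dele, ins = 93.3, 6.0, 0.7, 0.8

print(f"Err = Sub + Del + Ins = {sub + dele + ins:.1f}")  # 7.5, as in the table
print(f"Corr + Sub + Del = {corr + sub + dele:.1f}")      # 100.0 (all reference words accounted for)
```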

## train_pytorch_train_pytorch_conformer_large_librispeech_like_bpe5000_specaug
- Model files (archived to model.librispeech_like.tar.gz by `$ pack_model.sh`)
- model link: https://drive.google.com/file/d/1C2iZQu4P5RKxWAjpD-ZkJZcHIg2-ED51/view?usp=sharing
- training config file: `conf/tuning/train_pytorch_conformer_large_librispeech_like.yaml`
- decoding config file: `conf/tuning/decode_pytorch_transformer.yaml`
- cmvn file: `data/train/cmvn.ark`
- e2e file: `exp/train_pytorch_train_pytorch_conformer_large_librispeech_like_bpe5000_specaug/results/model.val5.avg.best`
- e2e JSON file: `exp/train_pytorch_train_pytorch_conformer_large_librispeech_like_bpe5000_specaug/results/model.json`
- dict file: `data/lang_1spm`
- No LM; 4-GPU training
### CER

|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err|
|---|---|---|---|---|---|---|---|---|
|decode_et_librispeech_test_other_decode|2939|71179|92.7|5.5|1.8|1.0|8.3|54.0|
|decode_et_mustc_tst-COMMON_decode|2641|58047|94.8|2.6|2.6|1.0|6.2|37.0|
|decode_et_tedlium2_test_decode|1155|33696|94.9|2.4|2.6|1.1|6.2|54.3|

### WER

|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err|
|---|---|---|---|---|---|---|---|---|
|decode_et_librispeech_test_other_decode|2939|53022|93.7|5.6|0.7|0.8|7.1|53.8|
|decode_et_mustc_tst-COMMON_decode|2641|47335|95.4|2.7|1.8|1.0|5.6|37.0|
|decode_et_tedlium2_test_decode|1155|27500|94.8|2.6|2.5|1.0|6.2|54.3|
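
Read against the mustc_like results further up, the librispeech_like configuration improves WER on every test set; the relative reductions follow directly from the two WER tables:

```python
# WER (Err column) pairs taken from the two WER tables: (mustc_like, librispeech_like).
wer = {
    "librispeech_test_other": (7.5, 7.1),
    "mustc_tst-COMMON": (5.8, 5.6),
    "tedlium2_test": (7.2, 6.2),
}
for name, (base, new) in wer.items():
    print(f"{name}: {100.0 * (base - new) / base:.1f}% relative WER reduction")
# roughly 5.3%, 3.4%, and 13.9%
```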
2 changes: 1 addition & 1 deletion egs/iwslt21/asr1/conf/train.yaml
@@ -4,4 +4,4 @@ penalty: 0.0
maxlenratio: 0.0
minlenratio: 0.0
ctc-weight: 0.5
- lm-weight: 0.7
+ lm-weight: 0.0
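
Together with the run.sh change below that drops the `--rnnlm` argument, setting `lm-weight: 0.0` removes the external LM from decoding. A minimal sketch of how these weights are typically combined in ESPnet-style joint CTC/attention beam search (the log-probability values are made up for illustration; length penalty and coverage terms are omitted):

```python
# Hypothetical scores for a single beam-search hypothesis (made-up numbers).
logp_att = -1.2   # attention decoder log-probability
logp_ctc = -1.5   # CTC prefix score
logp_lm = -2.0    # external RNNLM log-probability

ctc_weight = 0.5  # ctc-weight above
lm_weight = 0.0   # lm-weight above (was 0.7 before this change)

score = (1.0 - ctc_weight) * logp_att + ctc_weight * logp_ctc + lm_weight * logp_lm
print(score)      # -1.35; with lm_weight = 0.0 the LM term contributes nothing
```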
@@ -16,25 +16,25 @@ mtlalpha: 0.3
lsm-weight: 0.1

# minibatch related
- batch-size: 32
+ batch-size: 50
maxlen-in: 512 # if input length > maxlen-in, batchsize is automatically reduced
maxlen-out: 150 # if output length > maxlen-out, batchsize is automatically reduced
#batch-bins: 15000000

# optimization related
sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
opt: noam
- accum-grad: 4
+ accum-grad: 4 # worth tuning!
grad-clip: 5
patience: 0
epochs: 30
dropout-rate: 0.1

# transformer specific setting
backend: pytorch
- model-module: "espnet.nets.pytorch_backend.e2e_asr_transformer:E2E"
+ model-module: "espnet.nets.pytorch_backend.e2e_asr_conformer:E2E"
transformer-input-layer: conv2d # encoder architecture type
# transformer-lr: 10.0
- transformer-lr: 5.0
+ transformer-lr: 10.0 # worth tuning!
transformer-warmup-steps: 25000
transformer-attn-dropout-rate: 0.0
transformer-length-normalized-loss: false
@@ -43,3 +43,12 @@ transformer-init: pytorch
# Report CER & WER
report-cer: true
report-wer: true

# conformer specific setting
transformer-encoder-pos-enc-layer-type: rel_pos
transformer-encoder-selfattn-layer-type: rel_selfattn
rel-pos-type: latest
transformer-encoder-activation-type: swish
macaron-style: true
use-cnn-module: true
cnn-module-kernel: 31 # worth tuning!
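
These conformer-specific keys describe the encoder block layout: macaron-style half-step feed-forward layers around relative-position self-attention, plus a depthwise convolution module whose receptive field is set by `cnn-module-kernel`. The following is only a rough PyTorch sketch of that layout, not ESPnet's implementation; stock `nn.MultiheadAttention` (absolute positions) stands in for the `rel_selfattn`/`rel_pos` attention, and the real CNN module also contains pointwise convolutions, GLU, and batch norm that are omitted here.

```python
import torch
import torch.nn as nn
import torch.nn.functional as F


class ConformerBlockSketch(nn.Module):
    """Macaron FFN -> self-attention -> conv module -> macaron FFN, each with a residual."""

    def __init__(self, d_model=512, n_heads=8, ff_units=2048, kernel=31):
        super().__init__()
        self.ff1 = nn.Sequential(nn.LayerNorm(d_model), nn.Linear(d_model, ff_units),
                                 nn.SiLU(), nn.Linear(ff_units, d_model))
        self.attn_norm = nn.LayerNorm(d_model)
        # stand-in for the relative-position self-attention (rel_selfattn / rel_pos)
        self.attn = nn.MultiheadAttention(d_model, n_heads, batch_first=True)
        self.conv_norm = nn.LayerNorm(d_model)
        # depthwise convolution over time; odd kernel + symmetric padding keeps the length
        self.conv = nn.Conv1d(d_model, d_model, kernel, padding=kernel // 2, groups=d_model)
        self.ff2 = nn.Sequential(nn.LayerNorm(d_model), nn.Linear(d_model, ff_units),
                                 nn.SiLU(), nn.Linear(ff_units, d_model))
        self.out_norm = nn.LayerNorm(d_model)

    def forward(self, x):                        # x: (batch, time, d_model)
        x = x + 0.5 * self.ff1(x)                # first macaron half-step FFN
        h = self.attn_norm(x)
        x = x + self.attn(h, h, h, need_weights=False)[0]
        h = self.conv_norm(x).transpose(1, 2)    # (B, T, C) -> (B, C, T) for Conv1d
        x = x + F.silu(self.conv(h)).transpose(1, 2)  # swish activation in the CNN module
        x = x + 0.5 * self.ff2(x)                # second macaron half-step FFN
        return self.out_norm(x)


block = ConformerBlockSketch(d_model=512, n_heads=8, ff_units=2048, kernel=31)
out = block(torch.randn(2, 100, 512))            # (batch=2, frames=100, adim=512)
```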
@@ -0,0 +1,54 @@
# network architecture
# encoder related
elayers: 12
eunits: 2048
# decoder related
dlayers: 6
dunits: 2048
# attention related
adim: 512
aheads: 8

# hybrid CTC/attention
mtlalpha: 0.3

# label smoothing
lsm-weight: 0.1

# minibatch related
batch-size: 50 # worth tuning!
maxlen-in: 512 # if input length > maxlen-in, batchsize is automatically reduced
maxlen-out: 150 # if output length > maxlen-out, batchsize is automatically reduced
#batch-bins: 15000000

# optimization related
sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
opt: noam
accum-grad: 2 # worth tuning!
grad-clip: 5
patience: 0
epochs: 30
dropout-rate: 0.1

# transformer specific setting
backend: pytorch
model-module: "espnet.nets.pytorch_backend.e2e_asr_conformer:E2E"
transformer-input-layer: conv2d # encoder architecture type
transformer-lr: 2.0 # worth tuning!
transformer-warmup-steps: 25000
transformer-attn-dropout-rate: 0.0
transformer-length-normalized-loss: false
transformer-init: pytorch

# Report CER & WER
report-cer: true
report-wer: true

# conformer specific setting
transformer-encoder-pos-enc-layer-type: rel_pos
transformer-encoder-selfattn-layer-type: rel_selfattn
rel-pos-type: latest
transformer-encoder-activation-type: swish
macaron-style: true
use-cnn-module: true
cnn-module-kernel: 15 # worth tuning!
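
Several of the values flagged `# worth tuning!` interact. The effective batch size is roughly `batch-size × accum-grad × ngpu`, and with `opt: noam` the peak learning rate depends jointly on `transformer-lr`, `adim`, and `transformer-warmup-steps`. A small sketch of that arithmetic, assuming the Noam schedule in its standard form and the 4-GPU training noted in RESULTS.md:

```python
# Effective number of utterances per parameter update (values from this config;
# ngpu = 4 is an assumption based on the "4-GPU training" note in RESULTS.md).
batch_size, accum_grad, ngpu = 50, 2, 4
print("utterances per update:", batch_size * accum_grad * ngpu)  # 400

# Noam schedule in its standard form: lr rises linearly for `warmup` steps,
# then decays as 1/sqrt(step), scaled by transformer-lr and adim.
adim, lr_scale, warmup = 512, 2.0, 25000

def noam_lr(step: int) -> float:
    return lr_scale * adim ** -0.5 * min(step ** -0.5, step * warmup ** -1.5)

for step in (1_000, 25_000, 100_000):
    print(f"step {step}: lr = {noam_lr(step):.2e}")  # rises to the peak at warmup, then decays
```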
6 changes: 3 additions & 3 deletions egs/iwslt21/asr1/run.sh
@@ -12,7 +12,7 @@ stage=0 # start from -1 if you need to start from data download
stop_stage=100
ngpu=1 # number of gpus during training ("0" uses cpu, otherwise use gpu)
dec_ngpu=0 # number of gpus during decoding ("0" uses cpu, otherwise use gpu)
- nj=8 # number of parallel jobs for decoding
+ nj=32 # number of parallel jobs for decoding
debugmode=1
dumpdir=dump # directory to dump full features
N=0 # number of minibatches to be used (mainly for debugging). "0" uses all minibatches.
@@ -302,6 +302,7 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
${cuda_cmd} --gpu ${ngpu} ${expdir}/train.log \
asr_train.py \
--config ${train_config} \
+ --n-iter-processes 3 \
--preprocess-conf ${preprocess_config} \
--ngpu ${ngpu} \
--backend ${backend} \
@@ -361,8 +362,7 @@ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
--batchsize 0 \
--recog-json ${feat_recog_dir}/split${nj}utt/data_${bpemode}${nbpe}.JOB.json \
--result-label ${expdir}/${decode_dir}/data.JOB.json \
- --model ${expdir}/results/${recog_model} \
- --rnnlm ${lmexpdir}/rnnlm.model.best
+ --model ${expdir}/results/${recog_model}

score_sclite.sh --bpe ${nbpe} --bpemodel ${bpemodel}.model --wer true ${expdir}/${decode_dir} ${dict}
) &
