diff --git a/ci/test_integration.sh b/ci/test_integration.sh
index cdcc1c442ca..5bbd7aa6224 100755
--- a/ci/test_integration.sh
+++ b/ci/test_integration.sh
@@ -31,6 +31,9 @@ echo "=== ASR (backend=pytorch, model=rnn-no-ctc) ==="
 echo "=== ASR (backend=pytorch, model=transformer) ==="
 ./run.sh --stage 4 --train-config conf/train_transformer.yaml \
     --decode-config conf/decode.yaml
+echo "=== ASR (backend=pytorch, model=conformer) ==="
+./run.sh --stage 4 --train-config conf/train_conformer.yaml \
+    --decode-config conf/decode.yaml
 echo "=== ASR (backend=pytorch, model=transformer-pure-ctc) ==="
 ./run.sh --stage 4 --train-config conf/train_transformer_pure_ctc.yaml \
     --decode-config conf/decode_pure_ctc.yaml
@@ -94,7 +97,7 @@ done
 for t in ${feats_types}; do
     for t2 in ${token_types}; do
         echo "==== feats_type=${t}, token_types=${t2} ==="
-        ./run.sh --ngpu 0 --stage 6 --stop-stage 100 --feats-type "${t}" --token-type "${t2}" \
+        ./run.sh --ngpu 0 --stage 6 --stop-stage 13 --feats-type "${t}" --token-type "${t2}" \
             --asr-args "--max_epoch=1" --lm-args "--max_epoch=1"
     done
 done
@@ -109,7 +112,7 @@ echo "==== [ESPnet2] TTS ==="
 feats_types="raw fbank stft"
 for t in ${feats_types}; do
     echo "==== feats_type=${t} ==="
-    ./run.sh --ngpu 0 --stage 2 --stop-stage 100 --feats-type "${t}" --train-args "--max_epoch 1"
+    ./run.sh --ngpu 0 --stage 2 --stop-stage 8 --feats-type "${t}" --train-args "--max_epoch 1"
 done
 # Remove generated files in order to reduce the disk usage
 rm -rf exp dump data
diff --git a/egs/README.md b/egs/README.md
index 7375448fc4d..153a1535926 100644
--- a/egs/README.md
+++ b/egs/README.md
@@ -59,4 +59,5 @@ See: https://espnet.github.io/espnet/tutorial.html
 | voxforge | VoxForge | ASR | 7 languages | http://www.voxforge.org/ | |
 | wsj | CSR-I (WSJ0) Complete, CSR-II (WSJ1) Complete | ASR | EN | https://catalog.ldc.upenn.edu/LDC93S6A,https://catalog.ldc.upenn.edu/LDC94S13A | |
 | wsj_mix | MERL WSJ0-mix multi-speaker dataset | Multispeaker ASR | EN | http://www.merl.com/demos/deep-clustering | |
-| yesno | The "yesno" corpus | ASR | HE | http://www.openslr.org/1 | |
+| yesno | The "yesno" corpus | ASR | HE | http://www.openslr.org/1 | |
+| Yoloxóchitl-Mixtec | The Yoloxóchitl-Mixtec corpus | ASR | Mixtec | http://www.openslr.org/89 | |
diff --git a/egs/aishell/asr1/RESULTS.md b/egs/aishell/asr1/RESULTS.md
index 91d799cdc10..2d0ebb77f98 100644
--- a/egs/aishell/asr1/RESULTS.md
+++ b/egs/aishell/asr1/RESULTS.md
@@ -1,3 +1,16 @@
+# Conformer result
+
+- training config file: `conf/tuning/train_pytorch_conformer.yaml`
+- decoding config file: `conf/decode.yaml`
+```
+exp/train_sp_pytorch_train_pytorch_conformer/decode_dev_decode_pytorch_conformer/result.txt
+| SPKR    | # Snt  # Wrd | Corr  Sub  Del  Ins  Err  S.Err |
+| Sum/Avg | 14326 205341 | 94.7  5.1  0.1  0.1  5.4  39.0  |
+exp/train_sp_pytorch_train_pytorch_conformer/decode_test_decode_pytorch_conformer/result.txt
+| SPKR    | # Snt  # Wrd | Corr  Sub  Del  Ins  Err  S.Err |
+| Sum/Avg |  7176 104765 | 94.2  5.6  0.2  0.1  5.9  41.8  |
+```
+
 # Transformer result (default transformer with initial learning rate = 1.0 and epochs = 50)
 
 - Environments (obtained by `$ get_sys_info.sh`)
@@ -64,3 +77,4 @@ exp/train_sp_pytorch_train_pytorch_transformer_lr1.0/decode_test_decode_pytorch_
 | SPKR    | # Snt  # Wrd | Corr  Sub  Del  Ins  Err  S.Err |
 | Sum/Avg |  7176 104765 | 92.7  7.1  0.2  0.1  7.4  49.8  |
 ```
+
diff --git a/egs/aishell/asr1/conf/tuning/train_pytorch_conformer.yaml b/egs/aishell/asr1/conf/tuning/train_pytorch_conformer.yaml
new file mode 100644
index 00000000000..50d44abb5ab
--- /dev/null
+++ b/egs/aishell/asr1/conf/tuning/train_pytorch_conformer.yaml
@@ -0,0 +1,47 @@
+# network architecture
+# encoder related
+elayers: 12
+eunits: 2048
+# decoder related
+dlayers: 6
+dunits: 2048
+# attention related
+adim: 256
+aheads: 4
+
+# hybrid CTC/attention
+mtlalpha: 0.3
+
+# label smoothing
+lsm-weight: 0.1
+
+# minibatch related
+batch-size: 32
+maxlen-in: 512  # if input length > maxlen-in, batchsize is automatically reduced
+maxlen-out: 150 # if output length > maxlen-out, batchsize is automatically reduced
+
+# optimization related
+sortagrad: 0 # Feed samples from shortest to longest; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
+opt: noam
+accum-grad: 2
+grad-clip: 5
+patience: 0
+epochs: 50
+dropout-rate: 0.1
+
+# transformer specific setting
+backend: pytorch
+model-module: "espnet.nets.pytorch_backend.e2e_asr_conformer:E2E"
+transformer-input-layer: conv2d # encoder architecture type
+transformer-lr: 1.0
+transformer-warmup-steps: 25000
+transformer-attn-dropout-rate: 0.0
+transformer-length-normalized-loss: false
+transformer-init: pytorch
+
+# conformer specific setting
+transformer-encoder-pos-enc-layer-type: rel_pos
+transformer-encoder-selfattn-layer-type: rel_selfattn
+macaron-style: true
+use-cnn-module: true
+cnn-module-kernel: 31
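The config above drives the optimizer through the `opt: noam` / `transformer-lr` / `transformer-warmup-steps` trio. For reference, here is a minimal sketch of the Noam schedule these knobs feed, assuming the standard formulation (a constant factor scaled by `adim^-0.5`, linear warmup, then inverse-square-root decay); the function name `noam_lr` is illustrative, not part of ESPnet's API:

```python
# Sketch of the Noam learning-rate schedule, assuming the standard form:
#   lr(step) = factor * adim^-0.5 * min(step^-0.5, step * warmup^-1.5)
# The peak learning rate is reached exactly at step == warmup.

def noam_lr(step: int, factor: float = 1.0, adim: int = 256,
            warmup: int = 25000) -> float:
    """Learning rate at optimizer step `step` (step >= 1)."""
    return factor * adim ** -0.5 * min(step ** -0.5, step * warmup ** -1.5)

# Warmup ramps linearly from near zero, then decays as step^-0.5:
print(f"{noam_lr(1):.2e}")       # 1.58e-08, start of warmup
print(f"{noam_lr(25000):.2e}")   # 3.95e-04, peak at the warmup boundary
print(f"{noam_lr(100000):.2e}")  # 1.98e-04, inverse-sqrt decay
```

This is why `transformer-lr: 1.0` does not mean a literal learning rate of 1.0: it is only the scaling factor, and the effective peak here is roughly 4e-4.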
diff --git a/egs/mini_an4/asr1/conf/train_conformer.yaml b/egs/mini_an4/asr1/conf/train_conformer.yaml
new file mode 100644
index 00000000000..2ed64ae935d
--- /dev/null
+++ b/egs/mini_an4/asr1/conf/train_conformer.yaml
@@ -0,0 +1,47 @@
+# network architecture
+# encoder related
+elayers: 2
+eunits: 32
+# decoder related
+dlayers: 2
+dunits: 32
+# attention related
+adim: 16
+aheads: 4
+
+# hybrid CTC/attention
+mtlalpha: 0.3
+
+# label smoothing
+lsm-weight: 0.1
+
+# minibatch related
+batch-size: 2
+maxlen-in: 512  # if input length > maxlen-in, batchsize is automatically reduced
+maxlen-out: 150 # if output length > maxlen-out, batchsize is automatically reduced
+
+# optimization related
+sortagrad: 0 # Feed samples from shortest to longest; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
+opt: noam
+accum-grad: 2
+grad-clip: 5
+patience: 0
+epochs: 3
+dropout-rate: 0.1
+
+# transformer specific setting
+backend: pytorch
+model-module: "espnet.nets.pytorch_backend.e2e_asr_conformer:E2E"
+transformer-input-layer: conv2d # encoder architecture type
+transformer-lr: 5.0
+transformer-warmup-steps: 25000
+transformer-attn-dropout-rate: 0.0
+transformer-length-normalized-loss: false
+transformer-init: pytorch
+
+# conformer specific setting
+transformer-encoder-pos-enc-layer-type: rel_pos
+transformer-encoder-selfattn-layer-type: rel_selfattn
+macaron-style: true
+use-cnn-module: true
+cnn-module-kernel: 31
diff --git a/egs/wsj/asr1/local/filtering_samples.py b/egs/wsj/asr1/local/filtering_samples.py
index d2ad3dff7cb..ef421e39279 100755
--- a/egs/wsj/asr1/local/filtering_samples.py
+++ b/egs/wsj/asr1/local/filtering_samples.py
@@ -50,7 +50,7 @@
     args = parser.parse_args(cmd_args)
 
     # subsampling info
-    if args.etype.startswith("vgg"):
+    if hasattr(args, "etype") and args.etype.startswith("vgg"):
        # Subsampling is not performed for vgg*.
        # It is performed in max pooling layers at CNN.
        min_io_ratio = 4
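The `hasattr` guard added to `filtering_samples.py` matters because `--etype` is an RNN-encoder option: with a transformer or conformer config the parsed namespace presumably never receives an `etype` attribute, so the old `args.etype` access raised `AttributeError`. A minimal self-contained illustration of the failure mode and the fix (the bare parser below is a hypothetical stand-in, not the script's actual argument setup):

```python
# Hypothetical repro of the bug the hasattr() guard avoids: if --etype is
# never registered, the parsed namespace simply has no such attribute.
import argparse

parser = argparse.ArgumentParser()
# An RNN config path would add --etype here; a conformer config does not.
args = parser.parse_args([])

# Old form: raises AttributeError on conformer/transformer configs.
try:
    vgg_like = args.etype.startswith("vgg")
except AttributeError:
    vgg_like = False

# Patched form: hasattr() short-circuits before args.etype is touched.
vgg_like = hasattr(args, "etype") and args.etype.startswith("vgg")
print(vgg_like)  # False
```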
diff --git a/egs/yoloxochitl_mixtec/asr1/RESULTS.md b/egs/yoloxochitl_mixtec/asr1/RESULTS.md
new file mode 100644
index 00000000000..a5329bc7704
--- /dev/null
+++ b/egs/yoloxochitl_mixtec/asr1/RESULTS.md
@@ -0,0 +1,36 @@
+# RESULTS (100 epochs using a single GPU)
+## Environments
+- date: `Thu Jun 25 23:13:00 EDT 2020`
+- python version: `3.7.3 (default, Mar 27 2019, 22:11:17) [GCC 7.3.0]`
+- espnet version: `espnet 0.5.2`
+- chainer version: `chainer 6.0.0`
+- pytorch version: `pytorch 1.1.0`
+
+## Pre-trained Model
+- Model files (archived to model.tar.gz by `$ pack_model.sh`)
+  - model link: https://drive.google.com/file/d/1daXJp3mpvOKYYuEcgNbIDRyp16Q0gjFg/view?usp=sharing
+  - training config file: `conf/train.yaml`
+  - decoding config file: `conf/decode.yaml`
+  - cmvn file: `data/train_mixtec_surface_reserve/cmvn.ark`
+  - e2e file: `exp/train_mixtec_surface_reserve_pytorch_mixtec_surface_reserve/results/model.last10.avg.best`
+  - e2e JSON file: `exp/train_mixtec_surface_reserve_pytorch_mixtec_surface_reserve/results/model.json`
+  - lm file: `exp/train_rnnlm_pytorch_mixtec_surface_reserve_unigram150/rnnlm.model.best`
+  - lm JSON file: `exp/train_rnnlm_pytorch_mixtec_surface_reserve_unigram150/model.json`
+  - dict file: `data/lang_char`
+
+
+## train_mixtec_surface_reserve_pytorch_mixtec_surface_reserve
+### CER
+
+|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err|
+|---|---|---|---|---|---|---|---|---|
+|decode_dev_mixtec_surface_reserve_decode_mixtec_surface_reserve|10218|687420|89.6|6.0|4.5|2.7|13.2|87.8|
+|decode_test_mixtec_surface_reserve_decode_mixtec_surface_reserve|10112|688918|89.7|5.9|4.4|2.7|13.0|87.9|
+
+### WER
+
+|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err|
+|---|---|---|---|---|---|---|---|---|
+|decode_dev_mixtec_surface_reserve_decode_mixtec_surface_reserve|10218|165748|80.3|15.6|4.1|3.2|22.9|87.8|
+|decode_test_mixtec_surface_reserve_decode_mixtec_surface_reserve|10112|166168|80.5|15.5|4.1|3.2|22.7|87.9|
+
diff --git a/egs/yoloxochitl_mixtec/asr1/cmd.sh b/egs/yoloxochitl_mixtec/asr1/cmd.sh
new file mode 100644
index 00000000000..4d70c9c7a79
--- /dev/null
+++ b/egs/yoloxochitl_mixtec/asr1/cmd.sh
@@ -0,0 +1,89 @@
+# ====== About run.pl, queue.pl, slurm.pl, and ssh.pl ======
+# Usage: <cmd>.pl [options] JOB=1:<nj> <log> <command...>
+# e.g.
+# run.pl --mem 4G JOB=1:10 echo.JOB.log echo JOB
+#
+# Options:
+#   --time