Commit 16368ea

shincling committed Jul 22, 2020
2 parents 0bc48ac + 5c86f39
Showing 150 changed files with 7,627 additions and 1,579 deletions.
7 changes: 5 additions & 2 deletions ci/test_integration.sh
@@ -31,6 +31,9 @@ echo "=== ASR (backend=pytorch, model=rnn-no-ctc) ==="
echo "=== ASR (backend=pytorch, model=transformer) ==="
./run.sh --stage 4 --train-config conf/train_transformer.yaml \
--decode-config conf/decode.yaml
echo "=== ASR (backend=pytorch, model=conformer) ==="
./run.sh --stage 4 --train-config conf/train_conformer.yaml \
--decode-config conf/decode.yaml
echo "=== ASR (backend=pytorch, model=transformer-pure-ctc) ==="
./run.sh --stage 4 --train-config conf/train_transformer_pure_ctc.yaml \
--decode-config conf/decode_pure_ctc.yaml
@@ -94,7 +97,7 @@ done
for t in ${feats_types}; do
for t2 in ${token_types}; do
echo "==== feats_type=${t}, token_types=${t2} ==="
-./run.sh --ngpu 0 --stage 6 --stop-stage 100 --feats-type "${t}" --token-type "${t2}" \
+./run.sh --ngpu 0 --stage 6 --stop-stage 13 --feats-type "${t}" --token-type "${t2}" \
--asr-args "--max_epoch=1" --lm-args "--max_epoch=1"
done
done
@@ -109,7 +112,7 @@ echo "==== [ESPnet2] TTS ==="
feats_types="raw fbank stft"
for t in ${feats_types}; do
echo "==== feats_type=${t} ==="
-./run.sh --ngpu 0 --stage 2 --stop-stage 100 --feats-type "${t}" --train-args "--max_epoch 1"
+./run.sh --ngpu 0 --stage 2 --stop-stage 8 --feats-type "${t}" --train-args "--max_epoch 1"
done
# Remove generated files in order to reduce the disk usage
rm -rf exp dump data
3 changes: 2 additions & 1 deletion egs/README.md
@@ -59,4 +59,5 @@ See: https://espnet.github.io/espnet/tutorial.html
| voxforge | VoxForge | ASR | 7 languages | http://www.voxforge.org/ | |
| wsj | CSR-I (WSJ0) Complete, CSR-II (WSJ1) Complete | ASR | EN | https://catalog.ldc.upenn.edu/LDC93S6A,https://catalog.ldc.upenn.edu/LDC94S13A | |
| wsj_mix | MERL WSJ0-mix multi-speaker dataset | Multispeaker ASR | EN | http://www.merl.com/demos/deep-clustering | |
-| yesno | The "yesno" corpus | ASR | HE | http://www.openslr.org/1 | |
+| yesno | The "yesno" corpus | ASR | HE | http://www.openslr.org/1 | |
+| Yoloxóchitl-Mixtec | The Yoloxóchitl-Mixtec corpus | ASR | Mixtec | http://www.openslr.org/89 | |
14 changes: 14 additions & 0 deletions egs/aishell/asr1/RESULTS.md
@@ -1,3 +1,16 @@
# Conformer result

- training config file: `conf/tuning/train_pytorch_conformer.yaml`
- decoding config file: `conf/decode.yaml`
```
exp/train_sp_pytorch_train_pytorch_conformer/decode_dev_decode_pytorch_conformer/result.txt
| SPKR | # Snt # Wrd | Corr Sub Del Ins Err S.Err |
| Sum/Avg | 14326 205341 | 94.7 5.1 0.1 0.1 5.4 39.0 |
exp/train_sp_pytorch_train_pytorch_conformer/decode_test_decode_pytorch_conformer/result.txt
| SPKR | # Snt # Wrd | Corr Sub Del Ins Err S.Err |
| Sum/Avg | 7176 104765 | 94.2 5.6 0.2 0.1 5.9 41.8 |
```
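
The Corr/Sub/Del/Ins columns in these tables come from a Levenshtein alignment between reference and hypothesis, with Err = Sub + Del + Ins (as a percentage of reference tokens) and S.Err the fraction of sentences containing at least one error. A minimal Python sketch of that error counting follows, for orientation only; ESPnet actually produces these tables with sclite.

```python
# Minimal sketch of sclite-style error counting: align reference and
# hypothesis tokens with Levenshtein DP, then Err = (S + D + I) / N_ref.
# Illustration only; not the sclite tool that produced the tables above.
def error_counts(ref, hyp):
    n, m = len(ref), len(hyp)
    # d[i][j] = (total_cost, subs, dels, ins) for ref[:i] vs hyp[:j]
    d = [[(j, 0, 0, j) for j in range(m + 1)] for _ in range(n + 1)]
    for i in range(1, n + 1):
        d[i][0] = (i, 0, i, 0)
    for i in range(1, n + 1):
        for j in range(1, m + 1):
            if ref[i - 1] == hyp[j - 1]:
                d[i][j] = d[i - 1][j - 1]
            else:
                cands = [
                    (d[i - 1][j - 1], (1, 0, 0)),  # substitution
                    (d[i - 1][j], (0, 1, 0)),      # deletion
                    (d[i][j - 1], (0, 0, 1)),      # insertion
                ]
                (c, s, dl, ins), (ds, dd, di) = min(cands, key=lambda t: t[0][0])
                d[i][j] = (c + 1, s + ds, dl + dd, ins + di)
    _, s, dl, ins = d[n][m]
    return s, dl, ins

ref = "ni hao shi jie".split()
hyp = "ni hao da shi jie".split()
s, dl, ins = error_counts(ref, hyp)
print(f"Err = {100 * (s + dl + ins) / len(ref):.1f}%")  # Err = 25.0% (1 insertion)
```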

# Transformer result (default transformer with initial learning rate = 1.0 and epochs = 50)

- Environments (obtained by `$ get_sys_info.sh`)
@@ -64,3 +77,4 @@ exp/train_sp_pytorch_train_pytorch_transformer_lr1.0/decode_test_decode_pytorch_
| SPKR | # Snt # Wrd | Corr Sub Del Ins Err S.Err |
| Sum/Avg | 7176 104765 | 92.7 7.1 0.2 0.1 7.4 49.8 |
```

47 changes: 47 additions & 0 deletions egs/aishell/asr1/conf/tuning/train_pytorch_conformer.yaml
@@ -0,0 +1,47 @@
# network architecture
# encoder related
elayers: 12
eunits: 2048
# decoder related
dlayers: 6
dunits: 2048
# attention related
adim: 256
aheads: 4

# hybrid CTC/attention
mtlalpha: 0.3

# label smoothing
lsm-weight: 0.1

# minibatch related
batch-size: 32
maxlen-in: 512 # if input length > maxlen-in, batchsize is automatically reduced
maxlen-out: 150 # if output length > maxlen-out, batchsize is automatically reduced

# optimization related
sortagrad: 0 # Feed samples from shortest to longest; -1: enabled for all epochs, 0: disabled, N: enabled for the first N epochs
opt: noam
accum-grad: 2
grad-clip: 5
patience: 0
epochs: 50
dropout-rate: 0.1

# transformer specific setting
backend: pytorch
model-module: "espnet.nets.pytorch_backend.e2e_asr_conformer:E2E"
transformer-input-layer: conv2d # encoder architecture type
transformer-lr: 1.0
transformer-warmup-steps: 25000
transformer-attn-dropout-rate: 0.0
transformer-length-normalized-loss: false
transformer-init: pytorch

# conformer specific setting
transformer-encoder-pos-enc-layer-type: rel_pos
transformer-encoder-selfattn-layer-type: rel_selfattn
macaron-style: true
use-cnn-module: true
cnn-module-kernel: 31
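
The last five keys are what distinguish this config from the transformer recipe: relative positional encoding, relative-position self-attention, macaron-style half-step feed-forward modules, and a depthwise convolution module with kernel size 31. As a rough sketch of what one such encoder block computes, here is a simplified PyTorch version matching the adim, aheads, eunits, and cnn-module-kernel values above; it substitutes vanilla absolute-position self-attention for the rel_pos/rel_selfattn layers ESPnet actually uses, so treat it as an outline, not ESPnet's implementation.

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class ConformerBlock(nn.Module):
    """Simplified macaron-style conformer block (a sketch, not ESPnet's class)."""

    def __init__(self, adim=256, aheads=4, eunits=2048, kernel=31, p=0.1):
        super().__init__()
        def ffn():  # position-wise feed-forward module
            return nn.Sequential(
                nn.LayerNorm(adim), nn.Linear(adim, eunits), nn.SiLU(),
                nn.Dropout(p), nn.Linear(eunits, adim), nn.Dropout(p))
        self.ffn1, self.ffn2 = ffn(), ffn()
        self.attn_norm = nn.LayerNorm(adim)
        self.attn = nn.MultiheadAttention(adim, aheads, batch_first=True)
        self.conv_norm = nn.LayerNorm(adim)
        self.pw1 = nn.Conv1d(adim, 2 * adim, 1)  # pointwise; doubled for GLU
        self.dw = nn.Conv1d(adim, adim, kernel, padding=kernel // 2, groups=adim)
        self.bn = nn.BatchNorm1d(adim)
        self.pw2 = nn.Conv1d(adim, adim, 1)
        self.drop = nn.Dropout(p)
        self.out_norm = nn.LayerNorm(adim)

    def conv_module(self, x):                  # x: (batch, time, adim)
        h = self.conv_norm(x).transpose(1, 2)  # -> (batch, adim, time)
        h = F.glu(self.pw1(h), dim=1)          # gated pointwise convolution
        h = F.silu(self.bn(self.dw(h)))        # depthwise conv, kernel 31
        return self.drop(self.pw2(h).transpose(1, 2))

    def forward(self, x):
        x = x + 0.5 * self.ffn1(x)             # first macaron half-step FFN
        h = self.attn_norm(x)
        x = x + self.attn(h, h, h, need_weights=False)[0]
        x = x + self.conv_module(x)            # convolution module
        x = x + 0.5 * self.ffn2(x)             # second macaron half-step FFN
        return self.out_norm(x)

print(ConformerBlock()(torch.randn(2, 50, 256)).shape)  # torch.Size([2, 50, 256])
```

The mini_an4 config that follows uses the same structure with toy sizes (elayers: 2, adim: 16) so the conformer path can run inside the CI integration test above.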
47 changes: 47 additions & 0 deletions egs/mini_an4/asr1/conf/train_conformer.yaml
@@ -0,0 +1,47 @@
# network architecture
# encoder related
elayers: 2
eunits: 32
# decoder related
dlayers: 2
dunits: 32
# attention related
adim: 16
aheads: 4

# hybrid CTC/attention
mtlalpha: 0.3

# label smoothing
lsm-weight: 0.1

# minibatch related
batch-size: 2
maxlen-in: 512 # if input length > maxlen-in, batchsize is automatically reduced
maxlen-out: 150 # if output length > maxlen-out, batchsize is automatically reduced

# optimization related
sortagrad: 0 # Feed samples from shortest to longest; -1: enabled for all epochs, 0: disabled, N: enabled for the first N epochs
opt: noam
accum-grad: 2
grad-clip: 5
patience: 0
epochs: 3
dropout-rate: 0.1

# transformer specific setting
backend: pytorch
model-module: "espnet.nets.pytorch_backend.e2e_asr_conformer:E2E"
transformer-input-layer: conv2d # encoder architecture type
transformer-lr: 5.0
transformer-warmup-steps: 25000
transformer-attn-dropout-rate: 0.0
transformer-length-normalized-loss: false
transformer-init: pytorch

# conformer specific setting
transformer-encoder-pos-enc-layer-type: rel_pos
transformer-encoder-selfattn-layer-type: rel_selfattn
macaron-style: true
use-cnn-module: true
cnn-module-kernel: 31
2 changes: 1 addition & 1 deletion egs/wsj/asr1/local/filtering_samples.py
@@ -50,7 +50,7 @@
args = parser.parse_args(cmd_args)

# subsampling info
-if args.etype.startswith("vgg"):
+if hasattr(args, "etype") and args.etype.startswith("vgg"):
# Subsampling is not performed for vgg*;
# it is performed by the max-pooling layers in the CNN front-end.
min_io_ratio = 4
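
The guard added here matters because the argument namespace is rebuilt from a stored training config, and configs for some model types may not define etype at all, so probing with hasattr() avoids an AttributeError before the prefix check runs. A toy demonstration (the Namespace contents and the non-vgg fallback value are illustrative, not the script's actual logic):

```python
from argparse import Namespace

# Illustration only: some configs may lack "etype" entirely.
for args in (Namespace(etype="vggblstmp"), Namespace(etype="blstmp"), Namespace()):
    if hasattr(args, "etype") and args.etype.startswith("vgg"):
        min_io_ratio = 4  # vgg* subsamples by 4x in its max-pooling layers
    else:
        min_io_ratio = 1  # placeholder; the real script derives this elsewhere
    print(getattr(args, "etype", "<no etype>"), "->", min_io_ratio)
```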
36 changes: 36 additions & 0 deletions egs/yoloxochitl_mixtec/asr1/RESULTS.md
@@ -0,0 +1,36 @@
# RESULTS (100 epochs, single GPU)
## Environments
- date: `Thu Jun 25 23:13:00 EDT 2020`
- python version: `3.7.3 (default, Mar 27 2019, 22:11:17) [GCC 7.3.0]`
- espnet version: `espnet 0.5.2`
- chainer version: `chainer 6.0.0`
- pytorch version: `pytorch 1.1.0`

## Pre-trained Model
- Model files (archived to model.tar.gz by `$ pack_model.sh`)
- model link: https://drive.google.com/file/d/1daXJp3mpvOKYYuEcgNbIDRyp16Q0gjFg/view?usp=sharing
- training config file: `conf/train.yaml`
- decoding config file: `conf/decode.yaml`
- cmvn file: `data/train_mixtec_surface_reserve/cmvn.ark`
- e2e file: `exp/train_mixtec_surface_reserve_pytorch_mixtec_surface_reserve/results/model.last10.avg.best`
- e2e JSON file: `exp/train_mixtec_surface_reserve_pytorch_mixtec_surface_reserve/results/model.json`
- lm file: `exp/train_rnnlm_pytorch_mixtec_surface_reserve_unigram150/rnnlm.model.best`
- lm JSON file: `exp/train_rnnlm_pytorch_mixtec_surface_reserve_unigram150/model.json`
- dict file: `data/lang_char`


## train_mixtec_surface_reserve_pytorch_mixtec_surface_reserve
### CER

|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err|
|---|---|---|---|---|---|---|---|---|
|decode_dev_mixtec_surface_reserve_decode_mixtec_surface_reserve|10218|687420|89.6|6.0|4.5|2.7|13.2|87.8|
|decode_test_mixtec_surface_reserve_decode_mixtec_surface_reserve|10112|688918|89.7|5.9|4.4|2.7|13.0|87.9|

### WER

|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err|
|---|---|---|---|---|---|---|---|---|
|decode_dev_mixtec_surface_reserve_decode_mixtec_surface_reserve|10218|165748|80.3|15.6|4.1|3.2|22.9|87.8|
|decode_test_mixtec_surface_reserve_decode_mixtec_surface_reserve|10112|166168|80.5|15.5|4.1|3.2|22.7|87.9|

89 changes: 89 additions & 0 deletions egs/yoloxochitl_mixtec/asr1/cmd.sh
@@ -0,0 +1,89 @@
# ====== About run.pl, queue.pl, slurm.pl, and ssh.pl ======
# Usage: <cmd>.pl [options] JOB=1:<nj> <log> <command...>
# e.g.
# run.pl --mem 4G JOB=1:10 echo.JOB.log echo JOB
#
# Options:
# --time <time>: Limit the maximum time to execute.
# --mem <mem>: Limit the maximum memory usage.
# --max-jobs-run <njob>: Limit the number of parallel jobs. This is ignored for non-array jobs.
# --num-threads <nthread>: Specify the number of CPU cores.
# --gpu <ngpu>: Specify the number of GPU devices.
# --config: Change the configuration file from default.
#
# "JOB=1:10" is used for "array jobs" and it can control the number of parallel jobs.
# The string to the left of "=", i.e. "JOB", is replaced by <N> (the Nth job) in the command and in the log file name,
# e.g. "echo JOB" becomes "echo 3" for the 3rd job and "echo 8" for the 8th job.
# Note that the range must start from a positive number, so you can't use "JOB=0:10", for example.
#
# run.pl, queue.pl, slurm.pl, and ssh.pl have a unified interface that does not depend on the backend.
# These options are mapped to backend-specific options, as configured by
# "conf/queue.conf" and "conf/slurm.conf" by default.
# If jobs fail, your configuration might be wrong for your environment.
#
#
# The official documentation for run.pl, queue.pl, slurm.pl, and ssh.pl:
# "Parallelization in Kaldi": http://kaldi-asr.org/doc/queue.html
# =========================================================


# Select the backend used by run.sh from "local", "sge", "slurm", or "ssh"
cmd_backend='local'

# Local machine, without any Job scheduling system
if [ "${cmd_backend}" = local ]; then

# Used for all other jobs
export train_cmd="run.pl"
# Used for "*_train.py": "--gpu" is appended optionally by run.sh
export cuda_cmd="run.pl"
# Used for "*_recog.py"
export decode_cmd="run.pl"

# "qsub" (SGE, Torque, PBS, etc.)
elif [ "${cmd_backend}" = sge ]; then
# The default setting is written in conf/queue.conf.
# You must change "-q g.q" to a "queue" that exists in your environment.
# To list the "queue" names, type "qhost -q".
# Note that to use "--gpu *", you have to set up "complex_value" for the system scheduler.

export train_cmd="queue.pl"
export cuda_cmd="queue.pl"
export decode_cmd="queue.pl"

# "sbatch" (Slurm)
elif [ "${cmd_backend}" = slurm ]; then
# The default setting is written in conf/slurm.conf.
# You must change "-p cpu" and "-p gpu" to the "partition" names in your environment.
# To list the "partition" names, type "sinfo".
# You can use "--gpu *" by default for slurm, and it is interpreted as "--gres gpu:*".
# The devices are allocated exclusively using "${CUDA_VISIBLE_DEVICES}".

export train_cmd="slurm.pl"
export cuda_cmd="slurm.pl"
export decode_cmd="slurm.pl"

elif [ "${cmd_backend}" = ssh ]; then
# You have to create ".queue/machines" to specify the hosts on which to execute jobs.
# e.g. .queue/machines
#   host1
#   host2
#   host3
# It is assumed that you can log in to them without a password, i.e., you have set up ssh keys.

export train_cmd="ssh.pl"
export cuda_cmd="ssh.pl"
export decode_cmd="ssh.pl"

# This is an example of specifying several unique options in the JHU CLSP cluster setup.
# Users can modify/add their own command options according to their cluster environments.
elif [ "${cmd_backend}" = jhu ]; then

export train_cmd="queue.pl --mem 2G"
export cuda_cmd="queue-freegpu.pl --mem 2G --gpu 1 --config conf/gpu.conf"
export decode_cmd="queue.pl --mem 4G"

else
echo "$0: Error: Unknown cmd_backend=${cmd_backend}" 1>&2
return 1
fi
1 change: 1 addition & 0 deletions egs/yoloxochitl_mixtec/asr1/conf/decode.yaml
File renamed without changes.
10 changes: 10 additions & 0 deletions egs/yoloxochitl_mixtec/asr1/conf/gpu.conf
@@ -0,0 +1,10 @@
# Default configuration
command qsub -v PATH -cwd -S /bin/bash -j y -l arch=*64*
option mem=* -l mem_free=$0,ram_free=$0
option mem=0 # Do not add anything to qsub_opts
option num_threads=* -pe smp $0
option num_threads=1 # Do not add anything to qsub_opts
option max_jobs_run=* -tc $0
default gpu=0
option gpu=0
option gpu=* -l 'hostname=b1[12345678]*|c*,gpu=$0' -q g.q
8 changes: 8 additions & 0 deletions egs/yoloxochitl_mixtec/asr1/conf/lm.yaml
@@ -0,0 +1,8 @@
layer: 2
unit: 650
opt: sgd # or adam
sortagrad: 0 # Feed samples from shortest to longest; -1: enabled for all epochs, 0: disabled, N: enabled for the first N epochs
batchsize: 64 # batch size in LM training
epoch: 20 # if the data size is large, we can reduce this
patience: 3
maxlen: 100 # if sentence length > lm_maxlen, lm_batchsize is automatically reduced
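
For intuition, layer: 2 and unit: 650 describe a 2-layer recurrent LM with 650 hidden units per layer. A rough PyTorch shape-equivalent is sketched below; the LSTM cell type and the vocabulary size of 150 (borrowed from the unigram150 name in RESULTS.md) are assumptions, and this is not espnet's LM class.

```python
import torch
import torch.nn as nn

class RNNLM(nn.Module):
    """Shape-equivalent sketch of lm.yaml above (layer: 2, unit: 650)."""

    def __init__(self, vocab=150, unit=650, layers=2):
        super().__init__()
        self.embed = nn.Embedding(vocab, unit)
        self.rnn = nn.LSTM(unit, unit, num_layers=layers, batch_first=True)
        self.out = nn.Linear(unit, vocab)

    def forward(self, tokens, state=None):
        h, state = self.rnn(self.embed(tokens), state)
        return self.out(h), state  # next-token logits and recurrent state

# batchsize: 64 and maxlen: 100 from the config above
logits, _ = RNNLM()(torch.randint(0, 150, (64, 100)))
print(logits.shape)  # torch.Size([64, 100, 150])
```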
1 change: 1 addition & 0 deletions egs/yoloxochitl_mixtec/asr1/conf/pitch.conf
@@ -0,0 +1 @@
--sample-frequency=16000
10 changes: 10 additions & 0 deletions egs/yoloxochitl_mixtec/asr1/conf/queue.conf
@@ -0,0 +1,10 @@
# Default configuration
command qsub -v PATH -cwd -S /bin/bash -j y -l arch=*64*
option mem=* -l mem_free=$0,ram_free=$0
option mem=0 # Do not add anything to qsub_opts
option num_threads=* -pe smp $0
option num_threads=1 # Do not add anything to qsub_opts
option max_jobs_run=* -tc $0
default gpu=0
option gpu=0
option gpu=* -l gpu=$0 -q g.q
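
Each "option" line above maps a generic flag (e.g. --mem 4G) to scheduler arguments by substituting the flag's value for $0, with an exact-value rule (mem=0) taking precedence over the wildcard rule (mem=*). A rough Python illustration of that substitution idea follows; it mimics the semantics only and is not queue.pl's actual parser.

```python
# Rules transcribed from the queue.conf above; "" means add nothing.
rules = {
    ("mem", "*"): "-l mem_free=$0,ram_free=$0",
    ("mem", "0"): "",
    ("num_threads", "*"): "-pe smp $0",
    ("num_threads", "1"): "",
    ("max_jobs_run", "*"): "-tc $0",
    ("gpu", "*"): "-l gpu=$0 -q g.q",
    ("gpu", "0"): "",
}

def expand(name: str, value: str) -> str:
    # An exact-value rule wins over the wildcard rule (assumed semantics).
    template = rules.get((name, value), rules.get((name, "*"), ""))
    return template.replace("$0", value)

print(expand("mem", "4G"))         # -l mem_free=4G,ram_free=4G
print(expand("gpu", "2"))          # -l gpu=2 -q g.q
print(expand("num_threads", "1"))  # (empty: nothing added to qsub_opts)
```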
12 changes: 12 additions & 0 deletions egs/yoloxochitl_mixtec/asr1/conf/slurm.conf
@@ -0,0 +1,12 @@
# Default configuration
command sbatch --export=PATH --ntasks-per-node=1
option time=* --time $0
option mem=* --mem-per-cpu $0
option mem=0 # Do not add anything to qsub_opts
option num_threads=* --cpus-per-task $0 --ntasks-per-node=1
option num_threads=1 --cpus-per-task 1 --ntasks-per-node=1 # Do not add anything to qsub_opts
default gpu=0
option gpu=0 -p cpu
option gpu=* -p gpu --gres=gpu:$0
# note: the --max-jobs-run option is supported as a special case
# by slurm.pl and you don't have to handle it in the config file.
1 change: 1 addition & 0 deletions egs/yoloxochitl_mixtec/asr1/conf/train.yaml
@@ -0,0 +1,7 @@
batchsize: 0
beam-size: 10
penalty: 0.0
maxlenratio: 0.0
minlenratio: 0.0
ctc-weight: 0.5
lm-weight: 0.3
6 changes: 6 additions & 0 deletions egs/yoloxochitl_mixtec/asr1/conf/tuning/decode_rnn.yaml
@@ -0,0 +1,6 @@
lm-weight: 0.3
beam-size: 20
penalty: 0.0
maxlenratio: 0.0
minlenratio: 0.0
ctc-weight: 0.6
@@ -0,0 +1,40 @@
# network architecture
# encoder related
elayers: 12
eunits: 2048
# decoder related
dlayers: 6
dunits: 2048
# attention related
adim: 256
aheads: 4

# hybrid CTC/attention
mtlalpha: 0.3

# label smoothing
lsm-weight: 0.1

# minibatch related
batch-size: 32
maxlen-in: 512 # if input length > maxlen-in, batchsize is automatically reduced
maxlen-out: 150 # if output length > maxlen-out, batchsize is automatically reduced

# optimization related
sortagrad: 0 # Feed samples from shortest to longest; -1: enabled for all epochs, 0: disabled, N: enabled for the first N epochs
opt: noam
accum-grad: 2
grad-clip: 5
patience: 0
epochs: 100
dropout-rate: 0.1

# transformer specific setting
backend: pytorch
model-module: "espnet.nets.pytorch_backend.e2e_asr_transformer:E2E"
transformer-input-layer: conv2d # encoder architecture type
transformer-lr: 1.0
transformer-warmup-steps: 25000
transformer-attn-dropout-rate: 0.0
transformer-length-normalized-loss: false
transformer-init: pytorch
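
A closing note on the optimizer settings that recur in every training config above: opt: noam together with transformer-lr and transformer-warmup-steps describes the inverse-square-root warmup schedule from the Transformer paper. A sketch of the usual formula, assuming transformer-lr is the scale factor and adim the model dimension (a plausible but unverified mapping onto ESPnet's internals):

```python
def noam_lr(step: int, adim: int = 256, scale: float = 1.0, warmup: int = 25000) -> float:
    """lr = scale * adim**-0.5 * min(step**-0.5, step * warmup**-1.5)"""
    return scale * adim ** -0.5 * min(step ** -0.5, step * warmup ** -1.5)

# Rises linearly to a peak at step == warmup, then decays as 1/sqrt(step).
for step in (1000, 25000, 100000):
    print(step, f"{noam_lr(step):.2e}")
```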