
Commit

Merge branch 'espnet:master' into master
roshansh-cmu authored Mar 24, 2022
2 parents ab2fa25 + 350af36 commit aa706c5
Showing 111 changed files with 5,311 additions and 95 deletions.
3 changes: 0 additions & 3 deletions .gitmodules
@@ -1,3 +0,0 @@
[submodule "doc/notebook"]
path = doc/notebook
url = https://github.com/espnet/notebook
2 changes: 1 addition & 1 deletion README.md
@@ -133,7 +133,7 @@ To train the neural vocoder, please check the following repositories:
- Multi-speaker speech separation
- Unified encoder-separator-decoder structure for time-domain and frequency-domain models
- Encoder/Decoder: STFT/iSTFT, Convolution/Transposed-Convolution
- Separators: BLSTM, Transformer, Conformer, DPRNN, [DCCRN](https://arxiv.org/abs/2008.00264), Neural Beamformers, etc.
- Separators: BLSTM, Transformer, Conformer, [TasNet](https://arxiv.org/abs/1809.07454), [DPRNN](https://arxiv.org/abs/1910.06379), [DC-CRN](https://web.cse.ohio-state.edu/~wang.77/papers/TZW.taslp21.pdf), [DCCRN](https://arxiv.org/abs/2008.00264), Neural Beamformers, etc.
- Flexible ASR integration: working as an individual task or as the ASR frontend
- Easy to import pretrained models from [Asteroid](https://github.com/asteroid-team/asteroid)
- Both the pre-trained models from Asteroid and the specific configuration are supported.
2 changes: 2 additions & 0 deletions ci/doc.sh
@@ -26,6 +26,8 @@ set -euo pipefail
find ./utils/{*.sh,spm_*} -exec ./doc/usage2rst.sh {} \; | tee ./doc/_gen/utils_sh.rst
find ./espnet2/bin/*.py -exec ./doc/usage2rst.sh {} \; | tee ./doc/_gen/espnet2_bin.rst

./doc/notebook2rst.sh > ./doc/_gen/notebooks.rst

# generate package doc
./doc/module2rst.py --root espnet espnet2 --dst ./doc --exclude espnet.bin

2 changes: 1 addition & 1 deletion doc/.gitignore
@@ -1,4 +1,4 @@
_gen/
_build/
build/

notebook/
11 changes: 1 addition & 10 deletions doc/index.rst
@@ -28,16 +28,7 @@ ESPnet is an end-to-end speech processing toolkit, mainly focuses on end-to-end
./espnet2_task.md
./espnet2_distributed.md

.. toctree::
:maxdepth: 1
:caption: Notebook:

./notebook/asr_cli.ipynb
./notebook/asr_library.ipynb
./notebook/tts_cli.ipynb
./notebook/pretrained.ipynb
./notebook/tts_realtime_demo.ipynb
./notebook/st_demo.ipynb
.. include:: ./_gen/notebooks.rst

.. include:: ./_gen/modules.rst

4 changes: 2 additions & 2 deletions doc/installation.md
@@ -32,14 +32,14 @@ the following packages are installed using Anaconda, so you can skip them.)
# For CentOS
$ sudo yum install libsndfile
```
- ffmpeg (This is not required when installataion, but used in some recipes)
- ffmpeg (This is not required when installing, but used in some recipes)
```sh
# For Ubuntu
$ sudo apt-get install ffmpeg
# For CentOS
$ sudo yum install ffmpeg
```
- flac (This is not required when installataion, but used in some recipes)
- flac (This is not required when installing, but used in some recipes)
```sh
# For Ubuntu
$ sudo apt-get install flac
1 change: 0 additions & 1 deletion doc/notebook
Submodule notebook deleted from ef3cbf
17 changes: 17 additions & 0 deletions doc/notebook2rst.sh
@@ -0,0 +1,17 @@
#!/usr/bin/env bash

set -euo pipefail

cd "$(dirname "$0")"

if [ ! -d notebook ]; then
git clone https://github.com/espnet/notebook --depth 1
fi

echo "\
.. toctree::
:maxdepth: 1
:caption: Notebook:
"

find ./notebook/*.ipynb -exec echo " {}" \;
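
Together with the ci/doc.sh change above, this script replaces the deleted doc/notebook submodule with an on-demand clone. A rough sketch of what it generates is shown below; the notebook names are illustrative (taken from the toctree that doc/index.rst used to hard-code), not a listing of the current espnet/notebook repository.

```sh
# Hedged illustration only: the command is the one added to ci/doc.sh above,
# and the expected contents of the generated file are sketched in comments.
./doc/notebook2rst.sh > ./doc/_gen/notebooks.rst
# _gen/notebooks.rst then contains something like:
#
#   .. toctree::
#      :maxdepth: 1
#      :caption: Notebook:
#
#    ./notebook/asr_cli.ipynb
#    ./notebook/asr_library.ipynb
#    ./notebook/tts_cli.ipynb
```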
4 changes: 2 additions & 2 deletions egs2/README.md
@@ -52,6 +52,7 @@ See: https://espnet.github.io/espnet/espnet2_tutorial.html#recipes-using-espnet2
| librispeech_100 | LibriSpeech ASR corpus 100h subset | ASR | ENG | http://www.openslr.org/12 | |
| libritts | LibriTTS corpus | TTS | ENG | http://www.openslr.org/60 | |
| ljspeech | The LJ Speech Dataset | TTS | ENG | https://keithito.com/LJ-Speech-Dataset/ | |
| lrs3 | The Oxford-BBC Lip Reading Sentences 3 (LRS3) Dataset | ASR | ENG | https://www.robots.ox.ac.uk/~vgg/data/lip_reading/lrs3.html | |
| lrs2 | The Oxford-BBC Lip Reading Sentences 2 (LRS2) Dataset | Lipreading/ASR | ENG | https://www.robots.ox.ac.uk/~vgg/data/lip_reading/lrs2.html | |
| mini_an4 | Mini version of CMU AN4 database for the integration test | ASR/TTS/SE | ENG | http://www.speech.cs.cmu.edu/databases/an4/ | |
| mini_librispeech | Mini version of Librispeech corpus | DIAR | ENG | https://openslr.org/31/ | |
@@ -82,7 +83,7 @@ See: https://espnet.github.io/espnet/espnet2_tutorial.html#recipes-using-espnet2
| timit | TIMIT Acoustic-Phonetic Continuous Speech Corpus | ASR | ENG | https://catalog.ldc.upenn.edu/LDC93S1 | |
| totonac | Highland Totonac corpus (endangered language in central Mexico) | ASR | TOS | http://www.openslr.org/107/ | |
| tsukuyomi | つくよみちゃんコーパス | TTS | JPN | https://tyc.rei-yumesaki.net/material/corpus | |
| vctk | English Multi-speaker Corpus for CSTR Voice Cloning Toolkit | TTS | ENG | http://www.udialogue.org/download/cstr-vctk-corpus.html | |
| vctk | English Multi-speaker Corpus for CSTR Voice Cloning Toolkit | ASR/TTS | ENG | http://www.udialogue.org/download/cstr-vctk-corpus.html | |
| vctk_noisyreverb | Noisy reverberant speech database (48kHz) | SE | ENG | https://datashare.ed.ac.uk/handle/10283/2826 | |
| vivos | VIVOS (Vietnamese corpus for ASR) | ASR | VIE | https://ailab.hcmus.edu.vn/vivos/ | |
| voxforge | VoxForge | ASR | 7 languages | http://www.voxforge.org/ | |
@@ -95,4 +96,3 @@ See: https://espnet.github.io/espnet/espnet2_tutorial.html#recipes-using-espnet2
| yesno | The "yesno" corpus | ASR | HEB | http://www.openslr.org/1 | |
| yoloxochitl_mixtec | Yoloxochitl-Mixtec corpus (endangered language in central Mexico) | ASR | XTY | http://www.openslr.org/89 | |
| zeroth_korean | Zeroth-Korean | ASR | KOR | http://www.openslr.org/40 | |

7 changes: 6 additions & 1 deletion egs2/TEMPLATE/asr1/asr.sh
@@ -110,6 +110,8 @@ k2_config=./conf/decode_asr_transformer_with_k2.yaml

use_streaming=false # Whether to use streaming decoding

use_maskctc=false # Whether to use maskctc decoding

batch_size=1
inference_tag= # Suffix to the result dir for decoding.
inference_config= # Config for decoding.
@@ -224,6 +226,7 @@ Options:
--inference_asr_model # ASR model path for decoding (default="${inference_asr_model}").
--download_model # Download a model from Model Zoo and use it for decoding (default="${download_model}").
--use_streaming # Whether to use streaming decoding (default="${use_streaming}").
--use_maskctc # Whether to use maskctc decoding (default="${use_maskctc}").
# [Task dependent] Set the datadir name created by local/data.sh
--train_set # Name of training set (required).
@@ -895,7 +898,7 @@ if ! "${skip_train}"; then
if "${use_ngram}"; then
log "Stage 9: Ngram Training: train_set=${data_feats}/lm_train.txt"
cut -f 2- -d " " ${data_feats}/lm_train.txt | lmplz -S "20%" --discount_fallback -o ${ngram_num} - >${ngram_exp}/${ngram_num}gram.arpa
build_binary -s ${ngram_exp}/${ngram_num}gram.arpa ${ngram_exp}/${ngram_num}gram.bin
build_binary -s ${ngram_exp}/${ngram_num}gram.arpa ${ngram_exp}/${ngram_num}gram.bin
else
log "Stage 9: Skip ngram stages: use_ngram=${use_ngram}"
fi
@@ -1195,6 +1198,8 @@ if ! "${skip_eval}"; then
else
if "${use_streaming}"; then
asr_inference_tool="espnet2.bin.asr_inference_streaming"
elif "${use_maskctc}"; then
asr_inference_tool="espnet2.bin.asr_inference_maskctc"
else
asr_inference_tool="espnet2.bin.asr_inference"
fi
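
For reference, a hedged sketch of how the new flag would be used from a recipe directory: with --use_maskctc true, the decoding stage selects espnet2.bin.asr_inference_maskctc instead of espnet2.bin.asr_inference. The dataset names and the decoding config path below are placeholders, not part of this commit.

```sh
# Hypothetical invocation; dataset names and conf/decode_asr_maskctc.yaml are
# placeholders, while --use_maskctc and --inference_config are real options
# of egs2/TEMPLATE/asr1/asr.sh.
./asr.sh \
    --train_set train_nodev \
    --valid_set train_dev \
    --test_sets "train_dev test" \
    --use_maskctc true \
    --inference_config conf/decode_asr_maskctc.yaml
```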
1 change: 1 addition & 0 deletions egs2/TEMPLATE/asr1/db.sh
@@ -108,6 +108,7 @@ GOOGLEI18N=downloads
NOISY_SPEECH=
NOISY_REVERBERANT_SPEECH=
LRS2=
LRS3=
SUNDA=downloads
CMU_ARCTIC=downloads
CMU_INDIC=downloads
46 changes: 42 additions & 4 deletions egs2/TEMPLATE/asr1/pyscripts/utils/score_intent.py
@@ -12,7 +12,7 @@
import argparse


def get_classification_result(hyp_file, ref_file):
def get_classification_result(hyp_file, ref_file, hyp_write, ref_write):
hyp_lines = [line for line in hyp_file]
ref_lines = [line for line in ref_file]

@@ -22,6 +22,16 @@ def get_classification_result(hyp_file, ref_file):
ref_intent = ref_lines[line_count].split(" ")[0]
if hyp_intent != ref_intent:
error += 1
hyp_write.write(
" ".join(hyp_lines[line_count].split("\t")[0].split(" ")[1:])
+ "\t"
+ hyp_lines[line_count].split("\t")[1]
)
ref_write.write(
" ".join(ref_lines[line_count].split("\t")[0].split(" ")[1:])
+ "\t"
+ ref_lines[line_count].split("\t")[1]
)
return 1 - (error / len(hyp_lines))


@@ -56,7 +66,16 @@ def get_classification_result(hyp_file, ref_file):
os.path.join(exp_root, valid_inference_folder + "score_wer/ref.trn")
)

result = get_classification_result(valid_hyp_file, valid_ref_file)
valid_hyp_write_file = open(
os.path.join(exp_root, valid_inference_folder + "score_wer/hyp_asr.trn"), "w"
)
valid_ref_write_file = open(
os.path.join(exp_root, valid_inference_folder + "score_wer/ref_asr.trn"), "w"
)

result = get_classification_result(
valid_hyp_file, valid_ref_file, valid_hyp_write_file, valid_ref_write_file
)
print("Valid Intent Classification Result")
print(result)

@@ -66,8 +85,16 @@ def get_classification_result(hyp_file, ref_file):
test_ref_file = open(
os.path.join(exp_root, test_inference_folder + "score_wer/ref.trn")
)
test_hyp_write_file = open(
os.path.join(exp_root, test_inference_folder + "score_wer/hyp_asr.trn"), "w"
)
test_ref_write_file = open(
os.path.join(exp_root, test_inference_folder + "score_wer/ref_asr.trn"), "w"
)

result = get_classification_result(test_hyp_file, test_ref_file)
result = get_classification_result(
test_hyp_file, test_ref_file, test_hyp_write_file, test_ref_write_file
)
print("Test Intent Classification Result")
print(result)

@@ -79,6 +106,17 @@ def get_classification_result(hyp_file, ref_file):
utt_test_ref_file = open(
os.path.join(exp_root, utt_test_inference_folder + "score_wer/ref.trn")
)
result = get_classification_result(utt_test_hyp_file, utt_test_ref_file)
utt_test_hyp_write_file = open(
os.path.join(exp_root, utt_test_inference_folder + "score_wer/hyp_asr.trn"), "w"
)
utt_test_ref_write_file = open(
os.path.join(exp_root, utt_test_inference_folder + "score_wer/ref_asr.trn"), "w"
)
result = get_classification_result(
utt_test_hyp_file,
utt_test_ref_file,
utt_test_hyp_write_file,
utt_test_ref_write_file,
)
print("Unseen Utterance Test Intent Classification Result")
print(result)
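
In short, the new hyp_write/ref_write handles dump copies of hyp.trn/ref.trn with the leading intent token stripped from the text field, so the remaining tokens can be scored as plain ASR hypotheses and references (hyp_asr.trn / ref_asr.trn). A rough shell equivalent of that per-line rewrite is sketched below; the awk one-liner is only an illustration assuming the "<intent> <token> ... <TAB> <utt-id>" layout implied by the Python above, and is not part of the commit.

```sh
# Illustration only: drop the first (intent) token of the tab-separated text
# field and keep the utterance id, mirroring what the script writes out.
awk 'BEGIN { FS = OFS = "\t" } { sub(/^[^ ]+ /, "", $1); print $1, $2 }' \
    score_wer/hyp.trn > score_wer/hyp_asr.trn
```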
29 changes: 29 additions & 0 deletions egs2/bn_openslr53/asr1/README.md
@@ -0,0 +1,29 @@
# RESULTS
## Environments
- date: `Mon Jan 31 10:53:20 EST 2022`
- python version: `3.9.5 (default, Jun 4 2021, 12:28:51) [GCC 7.5.0]`
- espnet version: `espnet 0.10.6a1`
- pytorch version: `pytorch 1.8.1+cu102`
- Git hash: `9d09bf551a9fe090973de60e15adec1de6b3d054`
- Commit date: `Fri Jan 21 11:43:15 2022 -0500`
- Pretrained Model: https://huggingface.co/espnet/bn_openslr53

## asr_train_asr_raw_bpe1000
### WER

|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err|
|---|---|---|---|---|---|---|---|---|
|decode_asr_batch_size1_lm_lm_train_lm_bpe1000_valid.loss.ave_asr_model_valid.acc.best/sbn_test|2018|6470|74.2|21.3|4.5|2.2|28.0|48.8|

### CER

|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err|
|---|---|---|---|---|---|---|---|---|
|decode_asr_batch_size1_lm_lm_train_lm_bpe1000_valid.loss.ave_asr_model_valid.acc.best/sbn_test|2018|39196|89.4|4.3|6.3|1.4|12.0|48.8|

### TER

|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err|
|---|---|---|---|---|---|---|---|---|
|decode_asr_batch_size1_lm_lm_train_lm_bpe1000_valid.loss.ave_asr_model_valid.acc.best/sbn_test|2018|15595|77.6|12.7|9.7|1.6|24.0|48.7|

34 changes: 34 additions & 0 deletions egs2/chime4/enh1/README.md
@@ -6,6 +6,7 @@
- python version: `3.6.3 |Anaconda, Inc.| (default, Nov 20 2017, 20:41:42) [GCC 7.2.0]`
- espnet version: `espnet 0.9.7`
- pytorch version: `pytorch 1.6.0`
- Note: PESQ is evaluated based on https://github.com/vBaiCai/python-pesq


## enh_train_enh_conv_tasnet_raw
@@ -25,3 +26,36 @@ config: conf/tuning/train_enh_beamformer_mvdr.yaml
|---|---|---|---|---|---|---|
|enhanced_dt05_simu_isolated_6ch_track|2.60|0.94|13.67|13.67|0|12.51|
|enhanced_et05_simu_isolated_6ch_track|2.63|0.95|15.51|15.51|0|14.65|

<!-- These results are from the code after refactoring -->
## enh_train_enh_dc_crn_mapping_snr_raw

config: conf/tuning/train_enh_dc_crn_mapping_snr.yaml

|dataset|PESQ|STOI|SAR|SDR|SIR|SI_SNR|
|---|---|---|---|---|---|---|
|enhanced_dt05_simu_isolated_6ch_track|3.10|0.96|17.82|17.82|0.00|17.59|
|enhanced_et05_simu_isolated_6ch_track|2.95|0.95|17.33|17.33|0.00|17.04|

<!-- Generated by ./scripts/utils/show_enh_score.sh -->
# RESULTS
## Environments
- date: `Sat Mar 19 07:17:45 CST 2022`
- python version: `3.7.11 (default, Jul 27 2021, 14:32:16) [GCC 7.5.0]`
- espnet version: `espnet 0.10.7a1`
- pytorch version: `pytorch 1.8.1`
- Git hash: `648b024d8fb262eb9923c06a698b9c6df5b16e51`
- Commit date: `Wed Mar 16 18:47:21 2022 +0800`


## enh_train_enh_dprnntac_fasnet_raw

config: conf/tuning/train_enh_dprnntac_fasnet.yaml

Pretrained model: https://huggingface.co/lichenda/chime4_fasnet_dprnn_tac

|dataset|STOI|SAR|SDR|SIR|
|---|---|---|---|---|
|enhanced_dt05_simu_isolated_6ch_track|0.95|15.75|15.75|0.00|
|enhanced_et05_simu_isolated_6ch_track|0.94|15.40|15.40|0.00|

Original file line number Diff line number Diff line change
@@ -53,7 +53,7 @@ separator_conf:
bunits: 512
bprojs: 512
badim: 320
ref_channel: 4
ref_channel: 3
use_noise_mask: True
beamformer_type: mvdr_souden
bdropout_rate: 0.0
67 changes: 67 additions & 0 deletions egs2/chime4/enh1/conf/tuning/train_enh_dc_crn_mapping_snr.yaml
@@ -0,0 +1,67 @@
init: xavier_uniform
max_epoch: 200
batch_type: folded
batch_size: 16
iterator_type: chunk
chunk_length: 32000
num_workers: 4
optim: adam
optim_conf:
lr: 1.0e-03
eps: 1.0e-08
weight_decay: 1.0e-7
amsgrad: true
patience: 10
grad_clip: 5
val_scheduler_criterion:
- valid
- loss
best_model_criterion:
- - valid
- si_snr
- max
- - valid
- loss
- min
keep_nbest_models: 1
scheduler: steplr
scheduler_conf:
step_size: 2
gamma: 0.98

# A list for criterions
# The overall loss in the multi-task learning will be:
# loss = weight_1 * loss_1 + ... + weight_N * loss_N
# The default `weight` for each sub-loss is 1.0
criterions:
# The first criterion
- name: snr
conf:
eps: 1.0e-7
# the wrapper for the current criterion
# PIT is widely used in the speech separation task
wrapper: pit
wrapper_conf:
weight: 1.0


encoder: stft
encoder_conf:
n_fft: 256
hop_length: 128
decoder: stft
decoder_conf:
n_fft: 256
hop_length: 128
separator: dc_crn
separator_conf:
num_spk: 1
input_channels: [10, 16, 32, 64, 128, 256] # 5x2=10 input channels
enc_hid_channels: 8
enc_layers: 5
glstm_groups: 2
glstm_layers: 2
glstm_bidirectional: true
glstm_rearrange: false
mode: mapping
ref_channel: 3
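
For context, a config like this is normally selected through the recipe's enhancement script. The invocation below is a hedged sketch: the --enh_config, --train_set, --valid_set, and --test_sets options follow the egs2/TEMPLATE/enh1/enh.sh conventions rather than anything shown in this commit, and the training-set name is a guess modeled on the dt05/et05 names in the result tables above.

```sh
# Hypothetical usage from egs2/chime4/enh1 (option names and the training-set
# name are assumptions based on the ESPnet2 enhancement template, not this diff).
./enh.sh \
    --train_set tr05_simu_isolated_6ch_track \
    --valid_set dt05_simu_isolated_6ch_track \
    --test_sets "dt05_simu_isolated_6ch_track et05_simu_isolated_6ch_track" \
    --enh_config conf/tuning/train_enh_dc_crn_mapping_snr.yaml
```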