Fix conflict

chintu619 · Apr 24, 2022 · 16acdad · 16acdad
2 parents feb28ba + f6a2522
commit 16acdad
Show file tree

Hide file tree

Showing 460 changed files with 17,875 additions and 630 deletions.
diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml
@@ -0,0 +1,48 @@
+name: docker-builder
+
+on:
+  pull_request:
+    types: [closed]
+    branches:
+      - master
+    paths:
+      - 'tools/**'
+      - setup.py
+
+jobs:
+  docker:
+    runs-on: ubuntu-latest
+    if: github.event.pull_request.merged == true
+    steps:
+      - uses: actions/checkout@v2
+
+      - name: Set up QEMU
+        uses: docker/setup-qemu-action@v1
+
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v1
+
+      - name: Login to DockerHub
+        uses: docker/login-action@v1 
+        with:
+          username: ${{ secrets.DOCKERHUB_USERNAME }}
+          password: ${{ secrets.DOCKERHUB_TOKEN }}
+
+      - name: Build and push CPU container
+        run: |
+          cd docker
+          docker build --build-arg FROM_TAG=runtime-latest \
+            -f prebuilt/devel.dockerfile \
+            --target devel \
+            -t espnet/espnet:cpu-latest .
+          docker push espnet/espnet:cpu-latest   
+
+      - name: Build and push GPU container
+        run: |
+          cd docker
+          docker build --build-arg FROM_TAG=cuda-latest \
+            --build-arg CUDA_VER=11.1 \
+            -f prebuilt/devel.dockerfile \
+            --target devel \
+            -t espnet/espnet:gpu-latest .
+          docker push espnet/espnet:gpu-latest
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
@@ -151,6 +151,11 @@ we recommend using small model parameters and avoiding dynamic imports, file acc
 more running time, you can annotate your test with `@pytest.mark.execution_timeout(sec)`.
 - For test initialization (parameters, modules, etc), you can use pytest fixtures. Refer to  [pytest fixtures](https://docs.pytest.org/en/latest/fixture.html#using-fixtures-from-classes-modules-or-projects) for more information.
 
+In addition, please follow the [PEP 8 convention](https://peps.python.org/pep-0008/) for the coding style and [Google's convention for docstrings](https://google.github.io/styleguide/pyguide.html#383-functions-and-methods).
+Below are some specific points that should be taken care of in particular:
+- [import ordering](https://peps.python.org/pep-0008/#imports)
+- Avoid writing python2-style code. For example, `super().__init__()` is preferred over `super(CLASS_NAME, self).__init__()`.
+
 
 ### 4.2 Bash scripts
 

diff --git a/README.md b/README.md
@@ -77,12 +77,12 @@ ESPnet uses [pytorch](http://pytorch.org/) as a deep learning engine and also fo
 - Self-supervised learning representations as features, using upstream models in [S3PRL](https://github.com/s3prl/s3prl) in frontend.
   - Set `frontend` to be `s3prl`
   - Select any upstream model by setting the `frontend_conf` to the corresponding name.
+- Transfer Learning:
+  - Easy use of, and transfer from, models previously trained by your group or models from the [ESPnet huggingface repository](https://huggingface.co/espnet).
+  - [Documentation](https://github.com/espnet/espnet/tree/master/egs2/mini_an4/asr1/transfer_learning.md) and [toy example runnable on colab](https://github.com/espnet/notebook/blob/master/espnet2_asr_transfer_learning_demo.ipynb).
 - Streaming Transformer/Conformer ASR with blockwise synchronous beam search.
 - Restricted Self-Attention based on [Longformer](https://arxiv.org/abs/2004.05150) as an encoder for long sequences 
 
-### SUM: Speech Summarization
-- End to End Speech Summarization Recipe for Instructional Videos using Restricted Self-Attention [[Sharma et al., 2022]](https://arxiv.org/abs/2110.06263)
-
 Demonstration
 - Real-time ASR demo with ESPnet2  [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/espnet/notebook/blob/master/espnet2_asr_realtime_demo.ipynb)
 - [Gradio](https://github.com/gradio-app/gradio) Web Demo on [Huggingface Spaces](https://huggingface.co/docs/hub/spaces). Check out the [Web Demo](https://huggingface.co/spaces/akhaliq/espnet2_asr)
@@ -141,7 +141,6 @@ To train the neural vocoder, please check the following repositories:
 Demonstration
 - Interactive SE demo with ESPnet2 [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1fjRJCh96SoYLZPRxsjF9VDv4Q2VoIckI?usp=sharing)
 
-
 ### ST: Speech Translation & MT: Machine Translation
 - **State-of-the-art performance** in several ST benchmarks (comparable/superior to cascaded ASR and MT)
 - Transformer based end-to-end ST (new!)
@@ -152,9 +151,34 @@ Demonstration
 - End-to-end VC based on cascaded ASR+TTS (Baseline system for Voice Conversion Challenge 2020!)
 
 ### SLU: Speech Language Understanding
-- Predicting intent by directly classifying it as one of intent or decoding by character
-- Transformer & RNN based encoder-decoder model
-- Establish SOTA results with spectral augmentation (Performs better than reported results of pretrained model on Fluent Speech Command Dataset)
+- Architecture
+    - Transformer based Encoder
+    - Conformer based Encoder
+    - RNN based Decoder
+    - Transformer based Decoder
+- Support Multitasking with ASR
+    - Predict both intent and ASR transcript
+- Support Multitasking with NLU
+    - Deliberation encoder based 2 pass model
+- Support using pretrained ASR models
+    - Hubert
+    - Wav2vec2
+    - VQ-APC
+    - TERA and more ...
+- Support using pretrained NLP models
+    - BERT
+    - MPNet and more...
+- Various language support
+    - En / Jp / Zh / Nl / and more...
+- Supports using context from previous utterances
+- Supports using other tasks like SE in a pipeline manner
+
+Demonstration
+- Performing noisy spoken language understanding using speech enhancement model followed by spoken language understanding model.  [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/14nCrJ05vJcQX0cJuXjbMVFWUHJ3Wfb6N?usp=sharing)
+- Integrated to [Huggingface Spaces](https://huggingface.co/spaces) with [Gradio](https://github.com/gradio-app/gradio). See SLU demo on multiple languages: [![Hugging Face Spaces](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-blue)](https://huggingface.co/spaces/Siddhant/ESPnet2-SLU)
+
+
+### SUM: Speech Summarization
+- End to End Speech Summarization Recipe for Instructional Videos using Restricted Self-Attention [[Sharma et al., 2022]](https://arxiv.org/abs/2110.06263)
 
 ### DNN Framework
 - Flexible network architecture thanks to chainer and pytorch
@@ -532,11 +556,33 @@ You can download converted samples of the cascade ASR+TTS baseline system [here]
 
 ### SLU results
 
-<details><summary>ESPnet2</summary><div>
+<details><summary>expand</summary><div>
+
+
+We list the performance on various SLU tasks and datasets using the metric reported in the original dataset paper.
+
+| Task                                                              | Dataset                                                              |    Metric     |     Result     |                                                                              Pretrained Model                                         |
+| ----------------------------------------------------------------- | :-------------: | :-------------: | :-------------: | :-------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+| Intent Classification                                                 |     SLURP     |       Acc       |       86.3       |                [link](https://github.com/espnet/espnet/tree/master/egs2/slurp/asr1/README.md)                |
+| Intent Classification                                                   |     FSC     |       Acc       |       99.6       |                [link](https://github.com/espnet/espnet/tree/master/egs2/fsc/asr1/README.md)                |
+| Intent Classification                                                  |     FSC Unseen Speaker Set     |       Acc       |       98.6       |                [link](https://github.com/espnet/espnet/tree/master/egs2/fsc_unseen/asr1/README.md)                |
+| Intent Classification                                                   |     FSC Unseen Utterance Set     |       Acc       |       86.4       |                [link](https://github.com/espnet/espnet/tree/master/egs2/fsc_unseen/asr1/README.md)                |
+| Intent Classification                                                   |     FSC Challenge Speaker Set     |       Acc       |       97.5       |                [link](https://github.com/espnet/espnet/tree/master/egs2/fsc_challenge/asr1/README.md)                |
+| Intent Classification                                                   |     FSC Challenge Utterance Set     |       Acc       |       78.5       |                [link](https://github.com/espnet/espnet/tree/master/egs2/fsc_challenge/asr1/README.md)                |
+| Intent Classification                                                   |     SNIPS     |       F1       |       91.7       |                [link](https://github.com/espnet/espnet/tree/master/egs2/snips/asr1/README.md)                |
+| Intent Classification                                                   |     Grabo (Nl)   |       Acc       |       97.2       |                [link](https://github.com/espnet/espnet/tree/master/egs2/grabo/asr1/README.md)                |
+| Intent Classification                                                   |     CAT SLU MAP (Zh)     |       Acc       |       78.9       |                [link](https://github.com/espnet/espnet/tree/master/egs2/catslu/asr1/README.md)                |
+| Intent Classification                                                  |     Google Speech Commands    |       Acc       |       98.4       |                [link](https://github.com/espnet/espnet/tree/master/egs2/speechcommands/asr1/README.md)                |
+| Slot Filling                                                  |     SLURP     |       SLU-F1       |       71.9       |                [link](https://github.com/espnet/espnet/tree/master/egs2/slurp_entity/asr1/README.md)                |
+| Dialogue  Act Classification                                                 |     Switchboard     |       Acc       |       67.5       |                [link](https://github.com/espnet/espnet/tree/master/egs2/swbd_da/asr1/README.md)                |
+| Dialogue  Act Classification                                                 |     Jdcinal (Jp)    |       Acc       |       67.4       |                [link](https://github.com/espnet/espnet/tree/master/egs2/jdcinal/asr1/README.md)                |
+| Emotion Recognition                                                  |     IEMOCAP     |       Acc       |       69.4       |                [link](https://github.com/espnet/espnet/tree/master/egs2/iemocap/asr1/README.md)                |
+| Emotion Recognition                                                  |     swbd_sentiment     |       Macro F1       |       61.4       |                [link](https://github.com/espnet/espnet/tree/master/egs2/swbd_sentiment/asr1/README.md)                | 
+| Emotion Recognition                                                  |     slue_voxceleb     |       Macro F1       |       44.0       |                [link](https://github.com/espnet/espnet/tree/master/egs2/slue-voxceleb/asr1/README.md)                | 
 
-- Transformer based SLU for Fluent Speech Command Dataset
+
+If you want to check the results of the other recipes, please check `egs2/<name_of_recipe>/asr1/RESULTS.md`.
 
-In SLU, The objective is to infer the meaning or intent of spoken utterance. The [Fluent Speech Command Dataset](https://fluent.ai/fluent-speech-commands-a-dataset-for-spoken-language-understanding-research/) describes an intent as combination of 3 slot values: action, object and location. You can see baseline results on this dataset [here](https://github.com/espnet/espnet/blob/master/egs2/fsc/asr1/RESULTS.md)
 
 
 </div></details>
@@ -689,6 +735,8 @@ See the module documentation for more information.
 It is recommended to use models with RNN-based encoders (such as BLSTMP) for aligning large audio files;
 rather than using Transformer models that have a high memory consumption on longer audio data.
 The sample rate of the audio must be consistent with that of the data used in training; adjust with `sox` if needed.
+  
+Also, this tool can provide token-level segmentation information if the `text` file contains a list of tokens instead of a list of utterances. See the discussion in https://github.com/espnet/espnet/issues/4278#issuecomment-1100756463.
 
 </div></details>
 

diff --git a/egs/README.md b/egs/README.md
@@ -8,6 +8,7 @@ See: https://espnet.github.io/espnet/tutorial.html
 | Directory name          | Corpus name                                                  | Task                                       | Language       | URL                                                          | Note                          |
 | ----------------------- | ------------------------------------------------------------ | ------------------------------------------ | -------------- | ------------------------------------------------------------ | ----------------------------- |
 ||||
+| aesrc2020               | Accented English Speech Recognition Challenge 2020           | ASR                                        | EN             | https://arxiv.org/abs/2102.10233                                   |                               |
 | aidatatang_200zh        | Aidatatang_200zh A free Chinese Mandarin speech corpus       | ASR                                        | ZH             | http://www.openslr.org/62/                                   |                               |
 | aishell                 | AISHELL-ASR0009-OS1 Open Source Mandarin Speech Corpus       | ASR                                        | ZH             | http://www.aishelltech.com/kysjcp                            |                               |
 | aishell2                | AISHELL-2 Open Source Mandarin Speech Corpus                 | ASR                                        | ZH             | http://www.aishelltech.com/aishell_2                                                         |
@@ -49,7 +50,8 @@ See: https://espnet.github.io/espnet/tutorial.html
 | librispeech             | LibriSpeech ASR corpus                                       | ASR                                        | EN             | http://www.openslr.org/12                                    |                               |
 | libritts                | LibriTTS: A Corpus Derived from LibriSpeech for Text-to-Speech | TTS                                      | EN             | http://www.openslr.org/60/                                   |                               |
 | ljspeech                | The LJ Speech Dataset                                        | TTS                                        | EN             | https://keithito.com/LJ-Speech-Dataset/                      |                               |
-| lrs                     | The Lip Reading Sentences Dataset                            | ASR/AVSR                                       | EN             | https://www.robots.ox.ac.uk/~vgg/data/lip_reading/lrs2.html                      |                               |
+| lrs2                     | The Lip Reading Sentences 2 Dataset                            | ASR                                       | EN              | https://www.robots.ox.ac.uk/~vgg/data/lip_reading/lrs2.html                      |                               |
+| lrs                     | The Lip Reading Sentences 2 and 3 Datasets                           | AVSR                                       | EN              | https://www.robots.ox.ac.uk/~vgg/data/lip_reading/lrs2.html  https://www.robots.ox.ac.uk/~vgg/data/lip_reading/lrs3.html                     |                               |
 | m_ailabs                | The M-AILABS Speech Dataset                                  | TTS                                        | ~5 languages   | https://www.caito.de/2019/01/the-m-ailabs-speech-dataset/    |
 | mucs_2021               | MUCS 2021: MUltilingual and Code-Switching ASR Challenges for Low Resource Indian Languages   | ASR/Code Switching          | HI, MR, OR, TA, TE, GU, HI-EN, BN-EN | https://navana-tech.github.io/MUCS2021/data.html                    |                               |
 | mtedx                   | Multilingual TEDx | ASR/Machine Translation/Speech Translation | 13 Language pairs | http://www.openslr.org/100/                         |

diff --git a/egs/aesrc2020/asr1/RESULTS.md b/egs/aesrc2020/asr1/RESULTS.md
diff --git a/egs/lrs/asr1/cmd.sh → egs/aesrc2020/asr1/cmd.sh b/egs/lrs/asr1/cmd.sh → egs/aesrc2020/asr1/cmd.sh
diff --git a/egs/aesrc2020/asr1/conf/decode.yaml b/egs/aesrc2020/asr1/conf/decode.yaml
@@ -0,0 +1 @@
+tuning/decode_pytorch_transformer.yaml
diff --git a/egs/lrs/asr1/conf/fbank.conf → egs/aesrc2020/asr1/conf/fbank.conf b/egs/lrs/asr1/conf/fbank.conf → egs/aesrc2020/asr1/conf/fbank.conf
diff --git a/egs/lrs/asr1/conf/gpu.conf → egs/aesrc2020/asr1/conf/gpu.conf b/egs/lrs/asr1/conf/gpu.conf → egs/aesrc2020/asr1/conf/gpu.conf
diff --git a/egs/aesrc2020/asr1/conf/lm.yaml b/egs/aesrc2020/asr1/conf/lm.yaml
@@ -0,0 +1,8 @@
+# rnnlm related
+layer: 2
+unit: 650
+opt: sgd        # or adam
+batchsize: 64   # batch size in LM training
+epoch: 20      # if the data size is large, we can reduce this
+patience: 3
+maxlen: 100     # if sentence length > lm_maxlen, lm_batchsize is automatically reduced
diff --git a/egs/lrs/asr1/conf/pitch.conf → egs/aesrc2020/asr1/conf/pitch.conf b/egs/lrs/asr1/conf/pitch.conf → egs/aesrc2020/asr1/conf/pitch.conf
diff --git a/egs/lrs/asr1/conf/queue.conf → egs/aesrc2020/asr1/conf/queue.conf b/egs/lrs/asr1/conf/queue.conf → egs/aesrc2020/asr1/conf/queue.conf
diff --git a/egs/lrs/asr1/conf/slurm.conf → egs/aesrc2020/asr1/conf/slurm.conf b/egs/lrs/asr1/conf/slurm.conf → egs/aesrc2020/asr1/conf/slurm.conf
diff --git a/egs/aesrc2020/asr1/conf/specaug.yaml b/egs/aesrc2020/asr1/conf/specaug.yaml
@@ -0,0 +1,16 @@
+process:
+  # these three processes are a.k.a. SpecAugment
+  - type: "time_warp"
+    max_time_warp: 5
+    inplace: true
+    mode: "PIL"
+  - type: "freq_mask"
+    F: 30
+    n_mask: 2
+    inplace: true
+    replace_with_zero: false
+  - type: "time_mask"
+    T: 40
+    n_mask: 2
+    inplace: true
+    replace_with_zero: false
diff --git a/egs/aesrc2020/asr1/conf/train.yaml b/egs/aesrc2020/asr1/conf/train.yaml
@@ -0,0 +1 @@
+tuning/train_pytorch_conformer_kernel15.yaml
diff --git a/egs/aesrc2020/asr1/conf/tuning/decode_pytorch_transformer.yaml b/egs/aesrc2020/asr1/conf/tuning/decode_pytorch_transformer.yaml
@@ -0,0 +1,8 @@
+batchsize: 0
+beam-size: 10
+penalty: 0.0
+maxlenratio: 0.0
+minlenratio: 0.0
+ctc-weight: 0.5
+lm-weight: 0.3
+ngram-weight: 0.3
diff --git a/egs/aesrc2020/asr1/conf/tuning/decode_rnn.yaml b/egs/aesrc2020/asr1/conf/tuning/decode_rnn.yaml
@@ -0,0 +1,6 @@
+beam-size: 20
+penalty: 0.0
+maxlenratio: 0.0
+minlenratio: 0.0
+ctc-weight: 0.6
+lm-weight: 0.3
diff --git a/egs/aesrc2020/asr1/conf/tuning/train_pytorch_conformer_kernel15.yaml b/egs/aesrc2020/asr1/conf/tuning/train_pytorch_conformer_kernel15.yaml
@@ -0,0 +1,47 @@
+# network architecture
+# encoder related
+elayers: 12
+eunits: 2048
+# decoder related
+dlayers: 6
+dunits: 2048
+# attention related
+adim: 256
+aheads: 4
+
+# hybrid CTC/attention
+mtlalpha: 0.3
+
+# label smoothing
+lsm-weight: 0.1
+
+# minibatch related
+batch-size: 32
+maxlen-in: 512  # if input length  > maxlen-in, batchsize is automatically reduced
+maxlen-out: 150 # if output length > maxlen-out, batchsize is automatically reduced
+
+# optimization related
+sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
+opt: noam
+accum-grad: 2
+grad-clip: 5
+patience: 0
+epochs: 50
+dropout-rate: 0.1
+
+# transformer specific setting
+backend: pytorch
+model-module: "espnet.nets.pytorch_backend.e2e_asr_conformer:E2E"
+transformer-input-layer: conv2d     # encoder architecture type
+transformer-lr: 1.0
+transformer-warmup-steps: 25000
+transformer-attn-dropout-rate: 0.0
+transformer-length-normalized-loss: false
+transformer-init: pytorch
+
+# conformer specific setting
+transformer-encoder-pos-enc-layer-type: rel_pos
+transformer-encoder-selfattn-layer-type: rel_selfattn
+macaron-style: true
+use-cnn-module: true
+cnn-module-kernel: 15
diff --git a/egs/aesrc2020/asr1/conf/tuning/train_pytorch_conformer_kernel31.yaml b/egs/aesrc2020/asr1/conf/tuning/train_pytorch_conformer_kernel31.yaml
@@ -0,0 +1,47 @@
+# network architecture
+# encoder related
+elayers: 12
+eunits: 2048
+# decoder related
+dlayers: 6
+dunits: 2048
+# attention related
+adim: 256
+aheads: 4
+
+# hybrid CTC/attention
+mtlalpha: 0.3
+
+# label smoothing
+lsm-weight: 0.1
+
+# minibatch related
+batch-size: 32
+maxlen-in: 512  # if input length  > maxlen-in, batchsize is automatically reduced
+maxlen-out: 150 # if output length > maxlen-out, batchsize is automatically reduced
+
+# optimization related
+sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
+opt: noam
+accum-grad: 2
+grad-clip: 5
+patience: 0
+epochs: 50
+dropout-rate: 0.1
+
+# transformer specific setting
+backend: pytorch
+model-module: "espnet.nets.pytorch_backend.e2e_asr_conformer:E2E"
+transformer-input-layer: conv2d     # encoder architecture type
+transformer-lr: 1.0
+transformer-warmup-steps: 25000
+transformer-attn-dropout-rate: 0.0
+transformer-length-normalized-loss: false
+transformer-init: pytorch
+
+# conformer specific setting
+transformer-encoder-pos-enc-layer-type: rel_pos
+transformer-encoder-selfattn-layer-type: rel_selfattn
+macaron-style: true
+use-cnn-module: true
+cnn-module-kernel: 31