diff --git a/.coveragerc b/.coveragerc
index 8b036f1f426..8a5a2a144ec 100644
--- a/.coveragerc
+++ b/.coveragerc
@@ -1,4 +1,7 @@
 [report]
+omit =
+    tools/*
+
 # Regexes for lines to exclude from consideration
 exclude_lines =
     # Have to re-enable the standard pragma
diff --git a/.github/workflows/cancel.yml b/.github/workflows/cancel.yml
new file mode 100644
index 00000000000..7027d78785d
--- /dev/null
+++ b/.github/workflows/cancel.yml
@@ -0,0 +1,13 @@
+name: Cancel
+on:
+  workflow_run:
+    workflows: ["CI", "centos7", "debian9", "doc"]
+    types:
+      - requested
+jobs:
+  cancel:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: styfle/cancel-workflow-action@0.9.1
+        with:
+          workflow_id: ${{ github.event.workflow.id }}
diff --git a/.github/workflows/centos7.yml b/.github/workflows/centos7.yml
index e1373ee14c0..94d5973e859 100644
--- a/.github/workflows/centos7.yml
+++ b/.github/workflows/centos7.yml
@@ -9,25 +9,24 @@ on:
     - master
 
 jobs:
-  check_skip:
-    runs-on: ubuntu-latest
-    if: "! contains(github.event.head_commit.message, '[ci skip]')"
-    steps:
-      - run: echo "${{ github.event.head_commit.message }}"
-
-  linter_and_test:
+  test_centos7:
     runs-on: ubuntu-latest
     container:
       image: centos:7
    env:
-      ESPNET_PYTHON_VERSION: 3.6
-      TH_VERSION: 1.8.0
+      ESPNET_PYTHON_VERSION: 3.7
+      # NOTE: 1.9.0 raised libstdc++ version errors in pyworld:
+      #   ImportError: /lib64/libstdc++.so.6: version `CXXABI_1.3.8' not found
+      #   (required by /__w/espnet/espnet/tools/venv/envs/espnet/lib/python3.6/site-packages/pyworld/pyworld.cpython-36m-x86_64-linux-gnu.so)
+      # NOTE(kamo): The issue doesn't exist for python3.7?
+      TH_VERSION: 1.10.1
       CHAINER_VERSION: 6.0.0
       USE_CONDA: true
       CC: /opt/rh/devtoolset-7/root/usr/bin/gcc
       CXX: /opt/rh/devtoolset-7/root/usr/bin/g++
+      MAKE: /opt/rh/devtoolset-7/root/usr/bin/make
       # To avoid UnicodeEncodeError for python<=3.6
       LC_ALL: en_US.UTF-8
-    needs: check_skip
     steps:
      - uses: actions/checkout@master
      - name: check OS
@@ -35,9 +34,9 @@ jobs:
       - name: install dependencies
         run: |
           # NOTE(kamo): cmake and sndfile will be downloaded using anaconda
-          yum install -y git centos-release-scl make bzip2 wget which unzip bc patch
+          yum install -y git centos-release-scl bzip2 wget which unzip bc patch
           yum-config-manager --enable rhel-server-rhscl-7-rpms
-          yum install -y devtoolset-7-gcc-c++ sox
+          yum install -y devtoolset-7-gcc-c++ devtoolset-7-make sox ncurses-devel libtool automake autoconf
           localedef -f UTF-8 -i en_US en_US
       - name: install espnet
         run: |
diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
index 718254bdcb6..f1eb6fb47ae 100644
--- a/.github/workflows/ci.yaml
+++ b/.github/workflows/ci.yaml
@@ -9,42 +9,28 @@ on:
     - master
 
 jobs:
-  check_skip:
-    runs-on: ubuntu-18.04
-    if: "! contains(github.event.head_commit.message, '[ci skip]')"
-    steps:
-      - run: echo "${{ github.event.head_commit.message }}"
 
   linter_and_test:
     runs-on: ${{ matrix.os }}
-    needs: check_skip
     strategy:
       max-parallel: 20
       matrix:
-        # os: [ubuntu-16.04, ubuntu-18.04]
         os: [ubuntu-18.04]
-        python-version: [3.7, 3.8]
-        pytorch-version: [1.0.1, 1.1.0, 1.2.0, 1.3.1, 1.4.0, 1.5.1, 1.6.0, 1.7.1, 1.8.0]
+        python-version: [3.7]
+        pytorch-version: [1.4.0, 1.5.1, 1.6.0, 1.7.1, 1.8.1, 1.9.1, 1.10.1]
         chainer-version: [6.0.0]
         # NOTE(kamo): Conda is tested by Circle-CI
         use-conda: [false]
-        exclude:
-          # Exclude python=3.8 tests except for latest pytorch
-          - python-version: 3.8
-            pytorch-version: 1.0.1
-          - python-version: 3.8
-            pytorch-version: 1.1.0
-          - python-version: 3.8
-            pytorch-version: 1.2.0
-          - python-version: 3.8
-            pytorch-version: 1.3.1
-          - python-version: 3.8
-            pytorch-version: 1.4.0
-          - python-version: 3.8
-            pytorch-version: 1.5.1
-          - python-version: 3.8
-            pytorch-version: 1.6.0
-          - python-version: 3.8
-            pytorch-version: 1.7.1
+        include:
+          - os: ubuntu-20.04
+            python-version: 3.8
+            pytorch-version: 1.10.1
+            chainer-version: 6.0.0
+            use-conda: false
+          - os: ubuntu-20.04
+            python-version: 3.9
+            pytorch-version: 1.10.1
+            chainer-version: 6.0.0
+            use-conda: false
     steps:
       - uses: actions/checkout@master
       - uses: actions/cache@v1
@@ -69,16 +55,47 @@ jobs:
         CXX: g++-7
         run: |
           ./ci/install.sh
+
       - name: test shell
         run: |
           ./ci/test_shell.sh
+      - name: test python
+        run: ./ci/test_python.sh
+      - uses: codecov/codecov-action@v2
+        with:
+          flags: test_python
+      - name: coverage erase
         run: |
-          ./ci/test_python.sh
+          source tools/activate_python.sh
+          coverage erase
+
       - name: install kaldi
         run: |
           ./ci/install_kaldi.sh
-      - name: test integration
+
+      - name: test utils
+        run: ./ci/test_utils.sh
+      - uses: codecov/codecov-action@v2
+        with:
+          flags: test_utils
+      - name: coverage erase
+        run: |
+          source tools/activate_python.sh
+          coverage erase
+
+      - name: test espnet1 integration
+        run: ./ci/test_integration_espnet1.sh
+      - uses: codecov/codecov-action@v2
+        with:
+          flags: test_integration_espnet1
+      - name: coverage erase
         run: |
-          ./ci/test_integration.sh
-      - uses: codecov/codecov-action@v1
+          source tools/activate_python.sh
+          coverage erase
+
+      - name: test espnet2 integration
+        run: ./ci/test_integration_espnet2.sh
+      - uses: codecov/codecov-action@v2
+        with:
+          flags: test_integration_espnet2
diff --git a/.github/workflows/debian9.yml b/.github/workflows/debian9.yml
index e4288e8a836..a29e5474ad4 100644
--- a/.github/workflows/debian9.yml
+++ b/.github/workflows/debian9.yml
@@ -9,25 +9,19 @@ on:
     - master
 
 jobs:
-  check_skip:
-    runs-on: ubuntu-latest
-    if: "! contains(github.event.head_commit.message, '[ci skip]')"
-    steps:
-      - run: echo "${{ github.event.head_commit.message }}"
-
-  linter_and_test:
+  test_debian9:
     runs-on: ubuntu-latest
     container:
       image: debian:9
     env:
-      ESPNET_PYTHON_VERSION: 3.6
-      TH_VERSION: 1.8.0
+      ESPNET_PYTHON_VERSION: 3.7
+      TH_VERSION: 1.10.1
       CHAINER_VERSION: 6.0.0
       USE_CONDA: true
       CC: gcc-6
       CXX: g++-6
       # To avoid UnicodeEncodeError for python<=3.6
       LC_ALL: en_US.UTF-8
-    needs: check_skip
     steps:
       - uses: actions/checkout@master
       - name: check OS
@@ -36,7 +30,9 @@ jobs:
         run: |
           apt-get update -qq
           # NOTE(kamo): cmake and sndfile will be downloaded using anaconda
-          apt-get install -qq -y build-essential git g++-6 unzip bzip2 wget curl bc locales make sox
+          apt-get install -qq -y \
+              build-essential git g++-6 unzip bzip2 wget curl bc locales make sox \
+              libncurses5-dev automake libtool pkg-config
           localedef -f UTF-8 -i en_US en_US
       - name: install espnet
         run: ./ci/install.sh
@@ -46,6 +42,9 @@ jobs:
         run: ./ci/test_python.sh
       - name: install kaldi
         run: ./ci/install_kaldi.sh
-      - name: test integration
-        run: ./ci/test_integration.sh
-      - uses: codecov/codecov-action@v1
+      - name: test utils
+        run: ./ci/test_utils.sh
+      - name: test espnet1 integration
+        run: ./ci/test_integration_espnet1.sh
+      - name: test espnet2 integration
+        run: ./ci/test_integration_espnet2.sh
diff --git a/.github/workflows/doc.yml b/.github/workflows/doc.yml
new file mode 100644
index 00000000000..eede0bc8044
--- /dev/null
+++ b/.github/workflows/doc.yml
@@ -0,0 +1,46 @@
+name: doc
+
+on:
+  push:
+    branches:
+      - master
+  pull_request:
+    branches:
+      - master
+
+jobs:
+  linter_and_test:
+    runs-on: ubuntu-18.04
+    steps:
+      - uses: actions/checkout@master
+      - uses: actions/cache@v1
+        with:
+          path: ~/.cache/pip
+          key: pip-${{ hashFiles('**/setup.py') }}
+      - uses: actions/setup-python@v1
+        with:
+          python-version: 3.8
+          architecture: 'x64'
+      - name: check OS
+        run: cat /etc/os-release
+      - name: install dependencies
+        run: |
+          sudo apt-get update -qq
+          sudo apt-get install -qq -y cmake python3-dev git g++-7 pandoc ffmpeg bc
+      - name: install espnet
+        env:
+          ESPNET_PYTHON_VERSION: 3.8
+          TH_VERSION: 1.10.1
+          CHAINER_VERSION: 6.0.0
+          USE_CONDA: false
+          CC: gcc-7
+          CXX: g++-7
+        run: ./ci/install.sh
+      - name: generate doc
+        run: ./ci/doc.sh
+      - name: deploy
+        if: github.ref == 'refs/heads/master'
+        uses: peaceiris/actions-gh-pages@v3
+        with:
+          github_token: ${{ secrets.GITHUB_TOKEN }}
+          publish_dir: doc/build
diff --git a/.github/workflows/test_import.yaml b/.github/workflows/test_import.yaml
new file mode 100644
index 00000000000..ead9f587c07
--- /dev/null
+++ b/.github/workflows/test_import.yaml
@@ -0,0 +1,51 @@
+name: Test import espnet
+
+on:
+  push:
+    branches:
+      - master
+  pull_request:
+    branches:
+      - master
+
+jobs:
+  test_import:
+    runs-on: ${{ matrix.os }}
+    strategy:
+      max-parallel: 20
+      matrix:
+        os: [ubuntu-latest]
+        python-version: [3.9]
+        pytorch-version: [1.10.1]
+    steps:
+      - uses: actions/checkout@v2
+      - uses: actions/cache@v1
+        with:
+          path: ~/.cache/pip
+          key: ${{ runner.os }}-pip-${{ matrix.python-version }}-${{ matrix.pytorch-version }}-${{ hashFiles('**/setup.py') }}
+      - name: Set up Python
+        uses: actions/setup-python@v1
+        with:
+          python-version: ${{ matrix.python-version }}
+      - name: Install dependencies
+        run: |
+          sudo apt-get install -qq -y libsndfile1-dev
+          python3 -m pip install --upgrade pip
+      - name: Install espnet with minimal requirements
+        env:
+          TH_VERSION: ${{ matrix.pytorch-version }}
+        run: |
+          ./tools/installers/install_torch.sh false ${TH_VERSION} CPU
+          ./tools/installers/install_chainer.sh CPU
+          python3 -m pip install -e .
+      - name: Import all modules (Try1)
+        run: |
+          python3 ./ci/test_import_all.py
+      - name: Install espnet with the full requirements
+        env:
+          TH_VERSION: ${{ matrix.pytorch-version }}
+        run: |
+          python3 -m pip install -e ".[all]"
+      - name: Import all modules (Try2)
+        run: |
+          python3 ./ci/test_import_all.py
diff --git a/.gitignore b/.gitignore
index deff1897ea8..7170a376705 100644
--- a/.gitignore
+++ b/.gitignore
@@ -26,6 +26,7 @@ test_spm.model
 .vscode*
 *.vim
 *.swp
+*.nfs*
 
 # recipe related
 egs*/*/*/data*
@@ -39,8 +40,13 @@ egs*/*/*/mfcc
 egs*/*/*/stft
 egs*/*/*/tensorboard
 egs*/*/*/wav*
+egs*/*/*/nltk*
+egs*/*/*/.cache*
+egs*/*/*/pretrained_models*
+egs*/fisher_callhome_spanish/*/local/mapping*
 
 # tools related
+tools/chainer
 tools/bin
 tools/include
 tools/lib
@@ -48,6 +54,7 @@ tools/lib64
 tools/bats-core
 tools/chainer_ctc/
 tools/kaldi*
+tools/activate_python.sh
 tools/miniconda.sh
 tools/moses/
 tools/mwerSegmenter/
@@ -62,6 +69,8 @@ tools/PESQ*
 tools/hts_engine_API*
 tools/open_jtalk*
 tools/pyopenjtalk*
+tools/tdmelodic_openjtalk*
+tools/s3prl
 tools/sctk*
 tools/sph2pipe*
 tools/espeak-ng*
@@ -69,3 +78,5 @@ tools/MBROLA*
 tools/festival*
 tools/speech_tools*
 tools/phonemizer*
+tools/py3mmseg
+tools/._*
diff --git a/.gitmodules b/.gitmodules
index bc771d8c6ee..e69de29bb2d 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,3 +0,0 @@
-[submodule "doc/notebook"]
-    path = doc/notebook
-    url = https://github.com/espnet/notebook
diff --git a/.mergify.yml b/.mergify.yml
index 8e6872169ac..0304250182c 100644
--- a/.mergify.yml
+++ b/.mergify.yml
@@ -2,19 +2,22 @@ pull_request_rules:
   - name: automatic merge if label=auto-merge
     conditions:
       - "label=auto-merge"
-      - "status-success=linter_and_test (ubuntu-18.04, 3.7, 1.0.1, 6.0.0, false)"
-      - "status-success=linter_and_test (ubuntu-18.04, 3.7, 1.1.0, 6.0.0, false)"
-      - "status-success=linter_and_test (ubuntu-18.04, 3.7, 1.2.0, 6.0.0, false)"
-      - "status-success=linter_and_test (ubuntu-18.04, 3.7, 1.3.1, 6.0.0, false)"
-      - "status-success=linter_and_test (ubuntu-18.04, 3.7, 1.4.0, 6.0.0, false)"
-      - "status-success=linter_and_test (ubuntu-18.04, 3.7, 1.5.1, 6.0.0, false)"
-      - "status-success=linter_and_test (ubuntu-18.04, 3.7, 1.6.0, 6.0.0, false)"
-      - "status-success=linter_and_test (ubuntu-18.04, 3.7, 1.7.1, 6.0.0, false)"
-      - "status-success=linter_and_test (ubuntu-18.04, 3.8, 1.8.0, 6.0.0, false)"
+      - "check-success=test_centos7"
+      - "check-success=test_debian9"
+      - "check-success=linter_and_test (ubuntu-18.04, 3.7, 1.3.1, 6.0.0, false)"
+      - "check-success=linter_and_test (ubuntu-18.04, 3.7, 1.4.0, 6.0.0, false)"
+      - "check-success=linter_and_test (ubuntu-18.04, 3.7, 1.5.1, 6.0.0, false)"
+      - "check-success=linter_and_test (ubuntu-18.04, 3.7, 1.6.0, 6.0.0, false)"
+      - "check-success=linter_and_test (ubuntu-18.04, 3.7, 1.7.1, 6.0.0, false)"
+      - "check-success=linter_and_test (ubuntu-18.04, 3.7, 1.8.1, 6.0.0, false)"
+      - "check-success=linter_and_test (ubuntu-18.04, 3.7, 1.9.1, 6.0.0, false)"
+      - "check-success=linter_and_test (ubuntu-18.04, 3.7, 1.10.1, 6.0.0, false)"
+      - "check-success=linter_and_test (ubuntu-20.04, 3.8, 1.10.1, 6.0.0, false)"
+      - "check-success=linter_and_test (ubuntu-20.04, 3.9, 1.10.1, 6.0.0, false)"
+      - "check-success=test_import (ubuntu-latest, 3.9, 1.10.1)"
     actions:
       merge:
         method: merge
-        strict: false
   - name: delete head branch after merged
     conditions:
       - merged
diff --git a/.travis.yml b/.travis.yml
deleted file mode 100644
index be2f40c96df..00000000000
--- a/.travis.yml
+++ /dev/null
@@ -1,47 +0,0 @@
-dist: xenial
-language: python
-python:
-    - "3.8"
-
-cache:
-    - pip
-    - ccache
-
-addons:
-    apt:
-        sources:
-            - ubuntu-toolchain-r-test
-        packages:
-            - cmake
-            - python3-dev
-            - g++-7
-            - pandoc
-            - ffmpeg
-            - bc
-
-env:
-    - USE_CONDA=false ESPNET_PYTHON_VERSION=3.8 TH_VERSION=1.8.0 CHAINER_VERSION=6.0.0 CC=gcc-7 CXX=g++-7
-    # torch nightly
-    # - USE_CONDA=false ESPNET_PYTHON_VERSION=3.7.3 TH_VERSION=nightly CHAINER_VERSION=6.0.0 CC=gcc-7 CXX=g++-7
-
-matrix:
-    allow_failures:
-        # torch nightly
-        # - env: USE_CONDA=false ESPNET_PYTHON_VERSION=3.7.3 TH_VERSION=nightly CHAINER_VERSION=6.0.0 CC=gcc-7 CXX=g++-7
-
-install:
-    - travis_retry ./ci/install.sh
-    # - travis_retry ./ci/install_kaldi.sh
-
-script:
-    # NOTE(kamo): unittests and build documentation only
-    # - ./ci/test_shell.sh
-    - ./ci/test_python.sh
-    # - ./ci/test_integration.sh
-    - ./ci/doc.sh
-
-sudo: false
-
-after_success:
-    # - bash <(curl -s https://codecov.io/bash)
-    - travis-sphinx deploy -m "Update documentation [ci skip]"
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 31eca6141e1..979e7397012 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -1,19 +1,19 @@
 # How to contribute to ESPnet
 
 ## 1. What to contribute
-If you are interested in contributing to ESPnet, your contributions will fall into three categories: major features, minor updates, and recipes. 
+If you are interested in contributing to ESPnet, your contributions will fall into three categories: major features, minor updates, and recipes.
 
 ### 1.1 Major features
-If you want to ask or propose a new feature, please first open a new issue with the tag `Feature request` 
-or directly contact Shinji Watanabe or other main developers. Each feature implementation 
+If you want to ask or propose a new feature, please first open a new issue with the tag `Feature request`
+or directly contact Shinji Watanabe or other main developers. Each feature implementation
 and design should be discussed and modified according to ongoing and future works.
-You can find ongoing major development plans at https://github.com/espnet/espnet/milestones 
+You can find ongoing major development plans at https://github.com/espnet/espnet/milestones
 or in https://github.com/espnet/espnet/issues (pinned issues)
 
 ### 1.2 Minor Updates (minor feature, bug-fix for an issue)
-If you want to propose a minor feature, update an existing minor feature, or fix a bug, please first take a look at 
+If you want to propose a minor feature, update an existing minor feature, or fix a bug, please first take a look at
 the existing [issues](https://github.com/espnet/espnet/issues) and/or [pull requests](https://github.com/espnet/espnet/pulls).
 Pick an issue and comment on it that you want to work on that feature.
 
@@ -21,26 +21,26 @@ If you need help or additional information to propose the feature, you can open
 
 ### 1.3 Recipes
 
-ESPnet provides and maintains many example scripts, called `recipes`, that demonstrate how to 
+ESPnet provides and maintains many example scripts, called `recipes`, that demonstrate how to
 use the toolkit. The recipes for ESPnet1 are put under `egs` directory, while ESPnet2 ones are put under `egs2`.
 Similar to Kaldi, each subdirectory of `egs` and `egs2` corresponds to a corpus that we have example scripts for.
 
 #### 1.3.1 ESPnet1 recipes
 
-ESPnet1 recipes (`egs/X`) follow the convention from [Kaldi](https://github.com/kaldi-asr/kaldi) and may rely on 
-several utilities available in Kaldi. 
As such, porting a new recipe from Kaldi to ESPnet is natural, and the user
+ESPnet1 recipes (`egs/X`) follow the convention from [Kaldi](https://github.com/kaldi-asr/kaldi) and may rely on
+several utilities available in Kaldi. As such, porting a new recipe from Kaldi to ESPnet is natural, and the user
 may refer to [port-kaldi-recipe](https://github.com/espnet/espnet/wiki/How-to-port-the-Kaldi-recipe-to-the-ESPnet-recipe%3F)
-and other existing recipes for new additions. For the Kaldi-style recipe architecture, please refer to 
+and other existing recipes for new additions. For the Kaldi-style recipe architecture, please refer to
 [Prepare-Kaldi-Style-Directory](https://kaldi-asr.org/doc/data_prep.html).
- 
+
-For each recipe, we ask you to report the following: experiments results and environnement, model information. 
-For reproducibility, a link to upload the pre-trained model may also be added. All this information should be written 
-in a markdown file called `RESULTS.md` and put at the recipe root. You can refer to 
+For each recipe, we ask you to report the following: experiment results, environment, and model information.
+For reproducibility, a link to upload the pre-trained model may also be added. All this information should be written
+in a markdown file called `RESULTS.md` and put at the recipe root. You can refer to
 [tedlium2-example](https://github.com/espnet/espnet/blob/master/egs/tedlium2/asr1/RESULTS.md) for an example.
- 
+
 To generate `RESULTS.md` for a recipe, please follow these instructions:
-- Execute `~/espnet/utils/show_result.sh` at the recipe root (where `run.sh` is located). 
-You'll get your environment information and evaluation results for each experiment in a markdown format. 
+- Execute `~/espnet/utils/show_result.sh` at the recipe root (where `run.sh` is located).
+You'll get your environment information and evaluation results for each experiment in a markdown format.
 From here, you can copy or redirect text output to `RESULTS.md`.
 - Execute `~/espnet/utils/pack_model.sh` at the recipe root to generate a packed ESPnet model called `model.tar.gz`
 and output model information. Executing the utility script without argument will give you the expected arguments.
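+
+As a rough sketch, the two utilities above can be chained as follows (tedlium2 is only an example recipe; adjust the paths to your checkout):
+
+```sh
+cd egs/tedlium2/asr1                          # recipe root, where run.sh is located
+~/espnet/utils/show_result.sh > RESULTS.md    # environment info and results in markdown
+~/espnet/utils/pack_model.sh                  # without arguments, prints the expected usage
+```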
@@ -50,30 +50,62 @@ and output model information. Executing the utility script without argument will
 
 #### 1.3.2 ESPnet2 recipes
 
 ESPnet2's recipes correspond to `egs2`. ESPnet2 applies a new paradigm without dependencies of Kaldi's binaries, which makes it lighter and more generalized.
-For ESPnet2, we do not recommend preparing the recipe's stages for each corpus but using the common pipelines we provided in `asr.sh`, `tts.sh`, and 
+For ESPnet2, we do not recommend preparing the recipe's stages for each corpus but using the common pipelines we provided in `asr.sh`, `tts.sh`, and
 `enh.sh`. For details of creating ESPnet2 recipes, please refer to [egs2-readme](https://github.com/espnet/espnet/blob/master/egs2/TEMPLATE/README.md).
 
-The common pipeline of ESPnet2 recipes will take care of the `RESULTS.md` generation, model packing, and uploading. ESPnet2 models are maintained at Zenodo.
+The common pipeline of ESPnet2 recipes will take care of the `RESULTS.md` generation, model packing, and uploading. ESPnet2 models are maintained at Hugging Face and Zenodo (deprecated).
 You can also refer to the document in https://github.com/espnet/espnet_model_zoo
-To upload your model, you need first:
+To upload your model to Zenodo (deprecated; uploading to the Hugging Face Hub is preferred), you first need to:
 1. Sign up to Zenodo: https://zenodo.org/
 2. Create access token: https://zenodo.org/account/settings/applications/tokens/new/
 3. Set your environment: % export ACCESS_TOKEN=""
 
+To port models from Zenodo to the Hugging Face Hub:
+1. Create a Hugging Face account - https://huggingface.co/
+2. Request to be added to the espnet organization - https://huggingface.co/espnet
+3. Go to `egs2/RECIPE/*/scripts/utils` and run `./upload_models_to_hub.sh "ZENODO_MODEL_NAME"`
+
+To upload models using the Hugging Face CLI, follow these steps
+(you can also refer to https://huggingface.co/docs/transformers/model_sharing):
+1. Create a Hugging Face account - https://huggingface.co/
+2. Request to be added to the espnet organization - https://huggingface.co/espnet
+3. Run `huggingface-cli login` (you can create the access token needed at this step under Settings > Access Tokens)
+4. `huggingface-cli repo create your-model-name --organization espnet`
+5. `git clone https://huggingface.co/username/your-model-name` (clone this outside the ESPnet repository to avoid issues, as this is itself a git repo)
+6. `cd your-model-name`
+7. `git lfs install`
+8. Copy the contents of the `exp` directory of your recipe into this directory (check other models of a similar task under ESPnet to confirm your directory structure)
+9. `git add .`
+10. `git commit -m "Add model files"`
+11. `git push`
+12. Check if the inference demo on HF is running successfully to verify the upload
+
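+As a rough sketch, steps 4-11 above amount to the following (assuming `git-lfs` and `huggingface-cli` are installed and you are a member of the espnet organization; `your-model-name` and the `exp` path are placeholders):
+
+```sh
+huggingface-cli repo create your-model-name --organization espnet
+git clone https://huggingface.co/espnet/your-model-name   # clone outside the ESPnet repository
+cd your-model-name
+git lfs install
+cp -r /path/to/egs2/your_recipe/asr1/exp .                # copy the exp contents of your recipe
+git add .
+git commit -m "Add model files"
+git push
+```
+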
 
 #### 1.3.3 Additional requirements for new recipe
 
 - Common/shared files and directories such as `utils`, `steps`, `asr.sh`, etc, should be linked using
-a symbolic link (e.g.: `ln -s `). Please refer to existing recipes if you're 
+a symbolic link (e.g.: `ln -s `). Please refer to existing recipes if you're
 unaware which files/directories are shared. Note that in espnet2, some of them are automatically generated by https://github.com/espnet/espnet/blob/master/egs2/TEMPLATE/asr1/setup.sh.
-- Default training and decoding configurations (i.e.: the default one in `run.sh`) should be named respectively `train.yaml` 
+- Default training and decoding configurations (i.e.: the default one in `run.sh`) should be named respectively `train.yaml`
 and `decode.yaml` and put in `conf/`. Additional or variant configurations should be put in `conf/tuning/` and named according
-to its differences. 
+to their differences.
 - If a recipe for a new corpus is proposed, you should add its name and information to:
-https://github.com/espnet/espnet/blob/master/egs/README.md if it's a ESPnet1 recipe, 
+https://github.com/espnet/espnet/blob/master/egs/README.md if it's an ESPnet1 recipe,
 or https://github.com/espnet/espnet/blob/master/egs2/README.md + `db.sh` if it's an ESPnet2 recipe.
+
+#### 1.3.4 Checklist before you submit the recipe-based PR
+
+- [ ] be careful about the name of the recipe. It is recommended to follow the naming conventions of the other recipes
+- [ ] common/shared files are linked with a **soft link** (see Section 1.3.3)
+- [ ] modified or new python scripts should be passed through the **latest** black formatting (using the python package black). The command to be executed could be `black espnet espnet2 test utils setup.py egs*/*/*/local egs2/TEMPLATE/asr1/pyscripts`
+- [ ] cluster settings should be set as **default** (e.g., cmd.sh, conf/slurm.conf, conf/queue.conf, conf/pbs.conf)
+- [ ] update `egs/README.md` or `egs2/README.md` with the corresponding recipe
+- [ ] add a corresponding entry in `egs2/TEMPLATE/db.sh` for a new corpus
+- [ ] try to **simplify** the model configurations. We recommend having only the best configuration for the start of a recipe. Please also follow the default rule defined in Section 1.3.3
+- [ ] large meta-information for a corpus should be maintained elsewhere other than in the recipe itself
+- [ ] we recommend also including results and a pre-trained model with the recipe
 
 ## 2 Pull Request
 
-If your proposed feature or bugfix is ready, please open a Pull Request (PR) at https://github.com/espnet/espnet 
+If your proposed feature or bugfix is ready, please open a Pull Request (PR) at https://github.com/espnet/espnet
 or use the Pull Request button in your forked repo.
 If you're not familiar with the process, please refer to the following guides:
 - http://stackoverflow.com/questions/14680711/how-to-do-a-github-pull-request
@@ -85,7 +117,7 @@ We basically develop in the `master` branch.
 
 1. We will keep the first version digit `0` until we have some super major changes in the project organization level.
 
-2. The second version digit will be updated when we have major updates, including new functions and refactoring, and 
+2. The second version digit will be updated when we have major updates, including new functions and refactoring, and
 their related bug fix and recipe changes. This version update will be done roughly every half year so far
 (but it depends on the development plan).
 
@@ -114,11 +146,11 @@ have the format `def test_yyy(...)`. [Pytest](https://docs.pytest.org/en/latest
 Technically, a test file should only cover methods from one file (e.g.: `test_transformer_utils.py` to test `transformer_utils.py`).
 - To monitor test coverage and avoid the overlapping test, we recommend using `pytest --cov-report term-missing ` to highlight covered and missed lines. For more details, please refer to [coverage-test](https://pytest-cov.readthedocs.io/en/latest/readme.html).
-- We limited test running time to 2.0 seconds (see: [pytest-timeouts](https://pypi.org/project/pytest-timeouts/)). As such, 
+- We limited test running time to 2.0 seconds (see: [pytest-timeouts](https://pypi.org/project/pytest-timeouts/)). As such,
 we recommend using small model parameters and avoiding dynamic imports, file access, and unnecessary loops. If a unit test needs more running time, you can annotate your test with `@pytest.mark.execution_timeout(sec)`.
 - For test initialization (parameters, modules, etc), you can use pytest fixtures. Refer to [pytest fixtures](https://docs.pytest.org/en/latest/fixture.html#using-fixtures-from-classes-modules-or-projects) for more information.
- 
+
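+
+As a rough sketch, a local unit-test run following the recommendations above looks like this (assuming the `tools/` venv is set up; the test file is only an example):
+
+```sh
+source tools/activate_python.sh
+# run one test file and highlight covered and missed lines
+pytest --cov-report term-missing test/test_transformer_utils.py
+```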
 
 ### 4.2 Bash scripts
 
@@ -127,15 +159,15 @@ You can also test the scripts in `utils` with [bats-core](https://github.com/bat
 To test:
 ``` console
-./ci/test_bash.sh
+./ci/test_shell.sh
 ```
 
 ## 5 Integration testing
 
-Write new integration tests in [ci/test_integration.sh](ci/test_integration.sh) when you add new features in [espnet/bin](espnet/bin). They use our smallest dataset [egs/mini_an4](egs/mini_an4) to test `run.sh`. To make the coverage take them into account, don't forget `--python ${python}` support in your `run.sh`
+Write new integration tests in [ci/test_integration_espnet1.sh](ci/test_integration_espnet1.sh) or [ci/test_integration_espnet2.sh](ci/test_integration_espnet2.sh) when you add new features in [espnet/bin](espnet/bin) or [espnet2/bin](espnet2/bin), respectively. They use our smallest dataset [egs/mini_an4](egs/mini_an4) or [egs2/mini_an4](egs2/mini_an4) to test `run.sh`. **Don't call `python` directly in integration tests. Instead, use `coverage run --append`** as the python interpreter. In particular, `run.sh` should support `--python ${python}` to call the custom interpreter.
 
 ```bash
-# ci/integration_test.sh
+# ci/test_integration_espnet{1,2}.sh
 
 python="coverage run --append"
 
@@ -150,6 +182,7 @@ cd egs/mini_an4/your_task
 
 - [.travis.yml](.travis.yml) configures Travis-CI (unittests, doc deploy).
 - [.circleci/config.yml](.circleci/config.yml) configures Circle-CI (unittests, integration tests).
 - [.github/workflows](.github/workflows/) configures Github Actions (unittests, integration tests).
+- [codecov.yml](codecov.yml) configures CodeCov (code coverage).
 
 ## 6 Writing new tools
diff --git a/README.md b/README.md
index 05d56d86f66..082e5450f78 100644
--- a/README.md
+++ b/README.md
@@ -2,13 +2,14 @@
 
 # ESPnet: end-to-end speech processing toolkit
 
-|system/pytorch ver.|1.0.1|1.1.0|1.2.0|1.3.1|1.4.0|1.5.1|1.6.0|1.7.1|1.8.0|
-| :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: |
-|ubuntu18/python3.8/pip|||||||||[![Github Actions](https://github.com/espnet/espnet/workflows/CI/badge.svg)](https://github.com/espnet/espnet/actions)|
-|ubuntu18/python3.7/pip|[![Github Actions](https://github.com/espnet/espnet/workflows/CI/badge.svg)](https://github.com/espnet/espnet/actions)|[![Github Actions](https://github.com/espnet/espnet/workflows/CI/badge.svg)](https://github.com/espnet/espnet/actions)|[![Github Actions](https://github.com/espnet/espnet/workflows/CI/badge.svg)](https://github.com/espnet/espnet/actions)|[![Github Actions](https://github.com/espnet/espnet/workflows/CI/badge.svg)](https://github.com/espnet/espnet/actions)|[![Github Actions](https://github.com/espnet/espnet/workflows/CI/badge.svg)](https://github.com/espnet/espnet/actions)|[![Github Actions](https://github.com/espnet/espnet/workflows/CI/badge.svg)](https://github.com/espnet/espnet/actions)|[![Github Actions](https://github.com/espnet/espnet/workflows/CI/badge.svg)](https://github.com/espnet/espnet/actions)|[![Github Actions](https://github.com/espnet/espnet/workflows/CI/badge.svg)](https://github.com/espnet/espnet/actions)|[![Github Actions](https://github.com/espnet/espnet/workflows/CI/badge.svg)](https://github.com/espnet/espnet/actions)|
-|debian9/python3.6/conda|||||||||[![debian9](https://github.com/espnet/espnet/workflows/debian9/badge.svg)](https://github.com/espnet/espnet/actions?query=workflow%3Adebian9)|
-|centos7/python3.6/conda|||||||||[![centos7](https://github.com/espnet/espnet/workflows/centos7/badge.svg)](https://github.com/espnet/espnet/actions?query=workflow%3Acentos7)|
-|[docs/coverage] python3.8|||||||||[![Build Status](https://travis-ci.org/espnet/espnet.svg?branch=master)](https://travis-ci.org/espnet/espnet)|
+| system/pytorch ver. 
| 1.4.0 | 1.5.1 | 1.6.0 | 1.7.1 | 1.8.1 | 1.9.1 | 1.10.1 | +| :---------------------: | :--------------------------------------------------------------------------------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------: | :-------------------------------------------------------------------------------------------------------------------------------------------: | +| ubuntu20/python3.9/pip | | | | | | | [![Github Actions](https://github.com/espnet/espnet/workflows/CI/badge.svg)](https://github.com/espnet/espnet/actions) | +| ubuntu20/python3.8/pip | | | | | | | [![Github Actions](https://github.com/espnet/espnet/workflows/CI/badge.svg)](https://github.com/espnet/espnet/actions) | +| ubuntu18/python3.7/pip | [![Github Actions](https://github.com/espnet/espnet/workflows/CI/badge.svg)](https://github.com/espnet/espnet/actions) | [![Github Actions](https://github.com/espnet/espnet/workflows/CI/badge.svg)](https://github.com/espnet/espnet/actions) | [![Github Actions](https://github.com/espnet/espnet/workflows/CI/badge.svg)](https://github.com/espnet/espnet/actions) | [![Github Actions](https://github.com/espnet/espnet/workflows/CI/badge.svg)](https://github.com/espnet/espnet/actions) | [![Github Actions](https://github.com/espnet/espnet/workflows/CI/badge.svg)](https://github.com/espnet/espnet/actions) | [![Github Actions](https://github.com/espnet/espnet/workflows/CI/badge.svg)](https://github.com/espnet/espnet/actions) | [![Github Actions](https://github.com/espnet/espnet/workflows/CI/badge.svg)](https://github.com/espnet/espnet/actions) | +| debian9/python3.7/conda | | | | | | | [![debian9](https://github.com/espnet/espnet/workflows/debian9/badge.svg)](https://github.com/espnet/espnet/actions?query=workflow%3Adebian9) | +| centos7/python3.7/conda | | | | | | | [![centos7](https://github.com/espnet/espnet/workflows/centos7/badge.svg)](https://github.com/espnet/espnet/actions?query=workflow%3Acentos7) | +| doc/python3.8 | | | | | | | [![doc](https://github.com/espnet/espnet/workflows/doc/badge.svg)](https://github.com/espnet/espnet/actions?query=workflow%3Adoc) | [![PyPI version](https://badge.fury.io/py/espnet.svg)](https://badge.fury.io/py/espnet) [![Python Versions](https://img.shields.io/pypi/pyversions/espnet.svg)](https://pypi.org/project/espnet/) @@ -26,53 +27,97 @@ | [**Notebook**](https://github.com/espnet/notebook) | [**Tutorial (2019)**](https://github.com/espnet/interspeech2019-tutorial) -ESPnet is an end-to-end speech processing toolkit, mainly focuses on end-to-end speech recognition and end-to-end text-to-speech. -ESPnet uses [chainer](https://chainer.org/) and [pytorch](http://pytorch.org/) as a main deep learning engine, -and also follows [Kaldi](http://kaldi-asr.org/) style data processing, feature extraction/format, and recipes to provide a complete setup for speech recognition and other speech processing experiments. 
- +ESPnet is an end-to-end speech processing toolkit covering end-to-end speech recognition, text-to-speech, speech translation, speech enhancement, speaker diarization, spoken language understanding, and so on. +ESPnet uses [pytorch](http://pytorch.org/) as a deep learning engine and also follows [Kaldi](http://kaldi-asr.org/) style data processing, feature extraction/format, and recipes to provide a complete setup for various speech processing experiments. ## Key Features ### Kaldi style complete recipe - Support numbers of `ASR` recipes (WSJ, Switchboard, CHiME-4/5, Librispeech, TED, CSJ, AMI, HKUST, Voxforge, REVERB, etc.) - Support numbers of `TTS` recipes with a similar manner to the ASR recipe (LJSpeech, LibriTTS, M-AILABS, etc.) - Support numbers of `ST` recipes (Fisher-CallHome Spanish, Libri-trans, IWSLT'18, How2, Must-C, Mboshi-French, etc.) -- Support numbers of `MT` recipes (IWSLT'16, the above ST recipes etc.) -- Support speech separation and recognition recipe (WSJ-2mix) -- Support voice conversion recipe (VCC2020 baseline) (new!) - +- Support numbers of `MT` recipes (IWSLT'14, IWSLT'16, the above ST recipes etc.) +- Support numbers of `SLU` recipes (CATSLU-MAPS, FSC, Grabo, IEMOCAP, JDCINAL, SNIPS, SLURP, SWBD-DA, etc.) +- Support numbers of `SE/SS` recipes (DNS-IS2020, LibriMix, SMS-WSJ, VCTK-noisyreverb, WHAM!, WHAMR!, WSJ-2mix, etc.) +- Support voice conversion recipe (VCC2020 baseline) +- Support speaker diarization recipe (mini_librispeech) ### ASR: Automatic Speech Recognition - **State-of-the-art performance** in several ASR benchmarks (comparable/superior to hybrid DNN/HMM and CTC) - **Hybrid CTC/attention** based end-to-end ASR - Fast/accurate training with CTC/attention multitask training - CTC/attention joint decoding to boost monotonic alignment decoding - - Encoder: VGG-like CNN + BiRNN (LSTM/GRU), sub-sampling BiRNN (LSTM/GRU) or Transformer + - Encoder: VGG-like CNN + BiRNN (LSTM/GRU), sub-sampling BiRNN (LSTM/GRU), Transformer, or conformer - Attention: Dot product, location-aware attention, variants of multihead - Incorporate RNNLM/LSTMLM/TransformerLM/N-gram trained only with text data - Batch GPU decoding +- Data augmentation - **Transducer** based end-to-end ASR - - Available: RNN-based encoder/decoder or custom encoder/decoder w/ supports for Transformer, Conformer, TDNN (encoder) and causal conv1d (decoder) blocks. - - Also support: mixed RNN/Custom encoder-decoder, VGG2L (RNN/Cutom encoder) and various decoding algorithms. + - Architecture: + - RNN-based encoder and decoder. + - Custom encoder and decoder supporting Transformer, Conformer (encoder), 1D Conv / TDNN (encoder) and causal 1D Conv (decoder) blocks. + - VGG2L (RNN/custom encoder) and Conv2D (custom encoder) bottlenecks. + - Search algorithms: + - Greedy search constrained to one emission by timestep. + - Default beam search algorithm [[Graves, 2012]](https://arxiv.org/abs/1211.3711) without prefix search. + - Alignment-Length Synchronous decoding [[Saon et al., 2020]](https://ieeexplore.ieee.org/abstract/document/9053040). + - Time Synchronous Decoding [[Saon et al., 2020]](https://ieeexplore.ieee.org/abstract/document/9053040). + - N-step Constrained beam search modified from [[Kim et al., 2020]](https://arxiv.org/abs/2002.03577). + - modified Adaptive Expansion Search based on [[Kim et al., 2021]](https://ieeexplore.ieee.org/abstract/document/9250505) and NSC. 
+  - Features:
+    - Multi-task learning with various auxiliary losses:
+      - Encoder: CTC, auxiliary Transducer and symmetric KL divergence.
+      - Decoder: cross-entropy w/ label smoothing.
+    - Transfer learning with acoustic model and/or language model.
+    - Training with FastEmit regularization method [[Yu et al., 2021]](https://arxiv.org/abs/2010.11148).
 
 > Please refer to the [tutorial page](https://espnet.github.io/espnet/tutorial.html#transducer) for complete documentation.
 
 - CTC segmentation
-- Non-autoregressive based on Mask CTC
+- Non-autoregressive model based on Mask-CTC
 - ASR examples for supporting endangered language documentation (Please refer to egs/puebla_nahuatl and egs/yoloxochitl_mixtec for details)
 - Wav2Vec2.0 pretrained model as Encoder, imported from [FairSeq](https://github.com/pytorch/fairseq/tree/master/fairseq).
+- Self-supervised learning representations as features, using upstream models in [S3PRL](https://github.com/s3prl/s3prl) as the frontend.
+  - Set `frontend` to be `s3prl`
+  - Select any upstream model by setting the `frontend_conf` to the corresponding name.
+- Streaming Transformer/Conformer ASR with blockwise synchronous beam search.
+- Restricted Self-Attention based on [Longformer](https://arxiv.org/abs/2004.05150) as an encoder for long sequences
+
+### SUM: Speech Summarization
+- End-to-End Speech Summarization Recipe for Instructional Videos using Restricted Self-Attention [[Sharma et al., 2022]](https://arxiv.org/abs/2110.06263)
+
+Demonstration
+- Real-time ASR demo with ESPnet2 [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/espnet/notebook/blob/master/espnet2_asr_realtime_demo.ipynb)
+- [Gradio](https://github.com/gradio-app/gradio) Web Demo on [Huggingface Spaces](https://huggingface.co/docs/hub/spaces). Check out the [Web Demo](https://huggingface.co/spaces/akhaliq/espnet2_asr)
+- Streaming Transformer ASR [Local Demo](https://github.com/espnet/notebook/blob/master/espnet2_streaming_asr_demo.ipynb) with ESPnet2.
 
 ### TTS: Text-to-speech
-- Tacotron2
-- Transformer-TTS
-- FastSpeech
-- FastSpeech2 (in ESPnet2)
-- Conformer-based FastSpeech & FastSpeech2 (in ESPnet2)
-- Multi-speaker model with pretrained speaker embedding
-- Multi-speaker model with GST (in ESPnet2)
-- Phoneme-based training (En, Jp, and Zn)
-- Integration with neural vocoders (WaveNet, ParallelWaveGAN, and MelGAN)
-
-You can try demo online now!
+- Architecture
+  - Tacotron2
+  - Transformer-TTS
+  - FastSpeech
+  - FastSpeech2
+  - Conformer FastSpeech & FastSpeech2
+  - VITS
+- Multi-speaker & multi-language extension
+  - Pretrained speaker embedding (e.g., X-vector)
+  - Speaker ID embedding
+  - Language ID embedding
+  - Global style token (GST) embedding
+  - Mix of the above embeddings
+- End-to-end training
+  - End-to-end text-to-wav model (e.g., VITS)
+  - Joint training of text2mel and vocoder
+- Various language support
+  - En / Jp / Zh / De / Ru / And more...
+- Integration with neural vocoders
+  - Parallel WaveGAN
+  - MelGAN
+  - Multi-band MelGAN
+  - HiFiGAN
+  - StyleMelGAN
+  - Mix of the above models
+
+Demonstration
 - Real-time TTS demo with ESPnet2 [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/espnet/notebook/blob/master/espnet2_tts_realtime_demo.ipynb)
-- Real-time TTS demo with ESPnet1 [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/espnet/notebook/blob/master/tts_realtime_demo.ipynb)
+- Integrated into [Huggingface Spaces](https://huggingface.co/spaces) with [Gradio](https://github.com/gradio-app/gradio). See demo: [![Hugging Face Spaces](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-blue)](https://huggingface.co/spaces/akhaliq/ESPnet2-TTS)
 
 To train the neural vocoder, please check the following repositories:
 - [kan-bayashi/ParallelWaveGAN](https://github.com/kan-bayashi/ParallelWaveGAN)
@@ -82,6 +127,21 @@ To train the neural vocoder, please check the following repositories:
 > - We are moving to ESPnet2-based development for TTS.
 > - If you are a beginner, we recommend using [ESPnet2-TTS](https://github.com/espnet/espnet/tree/master/egs2/TEMPLATE/tts1).
 
+### SE: Speech enhancement (and separation)
+
+- Single-speaker speech enhancement
+- Multi-speaker speech separation
+- Unified encoder-separator-decoder structure for time-domain and frequency-domain models
+  - Encoder/Decoder: STFT/iSTFT, Convolution/Transposed-Convolution
+  - Separators: BLSTM, Transformer, Conformer, [TasNet](https://arxiv.org/abs/1809.07454), [DPRNN](https://arxiv.org/abs/1910.06379), [DC-CRN](https://web.cse.ohio-state.edu/~wang.77/papers/TZW.taslp21.pdf), [DCCRN](https://arxiv.org/abs/2008.00264), Neural Beamformers, etc.
+- Flexible ASR integration: working as an individual task or as the ASR frontend
+- Easy to import pretrained models from [Asteroid](https://github.com/asteroid-team/asteroid)
+  - Both the pre-trained models from Asteroid and the specific configuration are supported.
+
+Demonstration
+- Interactive SE demo with ESPnet2 [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1fjRJCh96SoYLZPRxsjF9VDv4Q2VoIckI?usp=sharing)
+
+
 ### ST: Speech Translation & MT: Machine Translation
 - **State-of-the-art performance** in several ST benchmarks (comparable/superior to cascaded ASR and MT)
 - Transformer based end-to-end ST (new!)
@@ -91,6 +151,11 @@
 - Transformer and Tacotron2 based parallel VC using melspectrogram (new!)
 - End-to-end VC based on cascaded ASR+TTS (Baseline system for Voice Conversion Challenge 2020!)
 
+### SLU: Spoken Language Understanding
+- Predicting intent either by directly classifying it as one of the intents or by decoding the intent character by character
+- Transformer & RNN based encoder-decoder model
+- Establishes SOTA results with spectral augmentation (performs better than the reported results of pretrained models on the Fluent Speech Commands dataset)
+
 ### DNN Framework
 - Flexible network architecture thanks to chainer and pytorch
 - Flexible front-end processing thanks to [kaldiio](https://github.com/nttcslab-sp/kaldiio) and HDF5 support
@@ -99,13 +164,13 @@
 
 ### ESPnet2
 See [ESPnet2](https://espnet.github.io/espnet/espnet2_tutorial.html).
-- Indepedent from Kaldi/Chainer, unlike ESPnet1
+- Independent from Kaldi/Chainer, unlike ESPnet1
 - On the fly feature extraction and text processing when training
 - Supporting both DistributedDataParallel and DataParallel
 - Supporting multi-node training and integrated with [Slurm](https://slurm.schedmd.com/) or MPI
 - Supporting Sharded Training provided by [fairscale](https://github.com/facebookresearch/fairscale)
 - A template recipe which can be applied for all corpora
-- Possible to train any size of corpus without cpu memory error
+- Possible to train any size of corpus without CPU memory error
 - [ESPnet Model Zoo](https://github.com/espnet/espnet_model_zoo)
 - Integrated with [wandb](https://espnet.github.io/espnet/espnet2_training_option.html#weights-biases-integration)
@@ -113,21 +178,23 @@
 - If you intend to do full experiments including DNN training, then see [Installation](https://espnet.github.io/espnet/installation.html).
 - If you just need the Python module only:
     ```sh
+    # We recommend installing pytorch before espnet, following https://pytorch.org/get-started/locally/
    pip install espnet
    # To install latest
    # pip install git+https://github.com/espnet/espnet
+    # To install additional packages
+    # pip install "espnet[all]"
    ```
 
-    You need to install some packages.
+    If you'll use ESPnet1, please install chainer and cupy.
 
    ```sh
-    pip install torch
-    pip install chainer==6.0.0 cupy==6.0.0    # [Option] If you'll use ESPnet1
-    pip install torchaudio    # [Option] If you'll use enhancement task
-    pip install torch_optimizer    # [Option] If you'll use additional optimizers in ESPnet2
+    pip install chainer==6.0.0 cupy==6.0.0    # [Option]
    ```
 
-    There are some required packages depending on each task other than above. If you meet ImportError, please intall them at that time.
+    You might need to install some packages depending on the task. We prepared various installation scripts at [tools/installers](tools/installers).
+
+- (ESPnet2) Once installed, run `wandb login` and set `--use_wandb true` to enable tracking runs using W&B.
 
 ## Usage
 See [Usage](https://espnet.github.io/espnet/tutorial.html).
@@ -137,7 +204,7 @@
 go to [docker/](docker/) and follow [instructions](https://espnet.github.io/espnet/docker.html).
 
 ## Contribution
-Thank you for taking times for ESPnet! Any contributions to ESPNet are welcome and feel free to ask any questions or requests to [issues](https://github.com/espnet/espnet/issues).
+Thank you for taking the time to contribute to ESPnet! Any contributions to ESPnet are welcome, and feel free to ask questions or make requests via [issues](https://github.com/espnet/espnet/issues).
 If it's the first contribution to ESPnet for you, please follow the [contribution guide](CONTRIBUTING.md).
 
 ## Results and demo
@@ -151,20 +218,22 @@ You can find useful tutorials and demos in [Interspeech 2019 Tutorial](https://g
 
 We list the character error rate (CER) and word error rate (WER) of major ASR tasks.
-| Task | CER (%) | WER (%) | Pretrained model| -| ----------- | :----: | :----: | :----: | -| Aishell dev/test | 4.6/5.1 | N/A | [link](https://github.com/espnet/espnet/blob/master/egs/aishell/asr1/RESULTS.md#conformer-kernel-size--15--specaugment--lm-weight--00-result) | -| **ESPnet2** Aishell dev/test | 4.4/4.7 | N/A | [link](https://github.com/espnet/espnet/tree/master/egs2/aishell/asr1#conformer--specaug--speed-perturbation-featsraw-n_fft512-hop_length128) | -| Common Voice dev/test | 1.7/1.8 | 2.2/2.3 | [link](https://github.com/espnet/espnet/blob/master/egs/commonvoice/asr1/RESULTS.md#first-results-default-pytorch-transformer-setting-with-bpe-100-epochs-single-gpu) | -| CSJ eval1/eval2/eval3 | 5.7/3.8/4.2 | N/A | [link](https://github.com/espnet/espnet/blob/master/egs/csj/asr1/RESULTS.md#pytorch-backend-transformer-without-any-hyperparameter-tuning) | -| **ESPnet2** CSJ eval1/eval2/eval3 | 4.5/3.3/3.6 | N/A | [link](https://github.com/espnet/espnet/tree/master/egs2/csj/asr1#initial-conformer-results) | -| HKUST dev | 23.5 | N/A | [link](https://github.com/espnet/espnet/blob/master/egs/hkust/asr1/RESULTS.md#transformer-only-20-epochs) | -| Librispeech dev_clean/dev_other/test_clean/test_other | N/A | 1.9/4.9/2.1/4.9 | [link](https://github.com/espnet/espnet/blob/master/egs/librispeech/asr1/RESULTS.md#pytorch-large-conformer-with-specaug--speed-perturbation-8-gpus--transformer-lm-4-gpus) | -| Switchboard (eval2000) callhm/swbd | N/A | 14.0/6.8 | [link](https://github.com/espnet/espnet/blob/master/egs/swbd/asr1/RESULTS.md#conformer-with-bpe-2000-specaug-speed-perturbation-transformer-lm-decoding) | -| TEDLIUM2 dev/test | N/A | 8.6/7.2 | [link](https://github.com/espnet/espnet/blob/master/egs/tedlium2/asr1/RESULTS.md#conformer-large-model--specaug--speed-perturbation--rnnlm) | -| TEDLIUM3 dev/test | N/A | 9.6/7.6 | [link](https://github.com/espnet/espnet/blob/master/egs/tedlium3/asr1/RESULTS.md) | -| WSJ dev93/eval92 | 3.2/2.1 | 7.0/4.7 | N/A | -| **ESPnet2** WSJ dev93/eval92 | 2.7/1.8 | 6.6/4.6 | [link](https://github.com/espnet/espnet/tree/master/egs2/wsj/asr1#using-transformer-lm-asr-model-is-same-as-the-above-lm_weight12-ctc_weight03-beam_size20) | +| Task | CER (%) | WER (%) | Pretrained model | +| ----------------------------------------------------------------- | :-------------: | :-------------: | :-------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| Aishell dev/test | 4.6/5.1 | N/A | [link](https://github.com/espnet/espnet/blob/master/egs/aishell/asr1/RESULTS.md#conformer-kernel-size--15--specaugment--lm-weight--00-result) | +| **ESPnet2** Aishell dev/test | 4.4/4.7 | N/A | [link](https://github.com/espnet/espnet/tree/master/egs2/aishell/asr1#conformer--specaug--speed-perturbation-featsraw-n_fft512-hop_length128) | +| Common Voice dev/test | 1.7/1.8 | 2.2/2.3 | [link](https://github.com/espnet/espnet/blob/master/egs/commonvoice/asr1/RESULTS.md#first-results-default-pytorch-transformer-setting-with-bpe-100-epochs-single-gpu) | +| CSJ eval1/eval2/eval3 | 5.7/3.8/4.2 | N/A | [link](https://github.com/espnet/espnet/blob/master/egs/csj/asr1/RESULTS.md#pytorch-backend-transformer-without-any-hyperparameter-tuning) | +| **ESPnet2** CSJ eval1/eval2/eval3 | 4.5/3.3/3.6 | N/A | [link](https://github.com/espnet/espnet/tree/master/egs2/csj/asr1#initial-conformer-results) | +| HKUST dev | 23.5 | N/A | 
[link](https://github.com/espnet/espnet/blob/master/egs/hkust/asr1/RESULTS.md#transformer-only-20-epochs) | +| **ESPnet2** HKUST dev | 21.2 | N/A | [link](https://github.com/espnet/espnet/tree/master/egs2/hkust/asr1#transformer-asr--transformer-lm) | +| Librispeech dev_clean/dev_other/test_clean/test_other | N/A | 1.9/4.9/2.1/4.9 | [link](https://github.com/espnet/espnet/blob/master/egs/librispeech/asr1/RESULTS.md#pytorch-large-conformer-with-specaug--speed-perturbation-8-gpus--transformer-lm-4-gpus) | +| **ESPnet2** Librispeech dev_clean/dev_other/test_clean/test_other | 0.6/1.5/0.6/1.4 | 1.7/3.4/1.8/3.6 | [link](https://github.com/espnet/espnet/tree/master/egs2/librispeech/asr1#self-supervised-learning-features-hubert_large_ll60k-conformer-utt_mvn-with-transformer-lm) | +| Switchboard (eval2000) callhm/swbd | N/A | 14.0/6.8 | [link](https://github.com/espnet/espnet/blob/master/egs/swbd/asr1/RESULTS.md#conformer-with-bpe-2000-specaug-speed-perturbation-transformer-lm-decoding) | +| TEDLIUM2 dev/test | N/A | 8.6/7.2 | [link](https://github.com/espnet/espnet/blob/master/egs/tedlium2/asr1/RESULTS.md#conformer-large-model--specaug--speed-perturbation--rnnlm) | +| TEDLIUM3 dev/test | N/A | 9.6/7.6 | [link](https://github.com/espnet/espnet/blob/master/egs/tedlium3/asr1/RESULTS.md) | +| WSJ dev93/eval92 | 3.2/2.1 | 7.0/4.7 | N/A | +| **ESPnet2** WSJ dev93/eval92 | 1.1/0.8 | 2.8/1.8 | [link](https://github.com/espnet/espnet/tree/master/egs2/wsj/asr1#self-supervised-learning-features-wav2vec2_large_ll60k-conformer-utt_mvn-with-transformer-lm) | Note that the performance of the CSJ, HKUST, and Librispeech tasks was significantly improved by using the wide network (#units = 1024) and large subword units if necessary reported by [RWTH](https://arxiv.org/pdf/1805.03294.pdf). @@ -191,7 +260,7 @@ The sampling rate must be consistent with that of data used in training. Available pretrained models in the demo script are listed as below. | Model | Notes | -| :------ | :------ | +| :----------------------------------------------------------------------------------------------- | :--------------------------------------------------------- | | [tedlium2.rnn.v1](https://drive.google.com/open?id=1UqIY6WJMZ4sxNxSugUqp3mrGb3j6h7xe) | Streaming decoding based on CTC-based VAD | | [tedlium2.rnn.v2](https://drive.google.com/open?id=1cac5Uc09lJrCYfWkLQsF8eapQcxZnYdf) | Streaming decoding based on CTC-based VAD (batch decoding) | | [tedlium2.transformer.v1](https://drive.google.com/open?id=1cVeSOYY1twOfL9Gns7Z3ZDnkrJqNwPow) | Joint-CTC attention Transformer trained on Tedlium 2 | @@ -203,6 +272,30 @@ Available pretrained models in the demo script are listed as below. +### SE results +
expand
+
+We list results from three different models on WSJ0-2mix, which is one of the most widely used benchmark datasets for speech separation.
+
+| Model                                             | STOI | SAR   | SDR   | SIR   |
+| ------------------------------------------------- | ---- | ----- | ----- | ----- |
+| [TF Masking](https://zenodo.org/record/4498554)   | 0.89 | 11.40 | 10.24 | 18.04 |
+| [Conv-Tasnet](https://zenodo.org/record/4498562)  | 0.95 | 16.62 | 15.94 | 25.90 |
+| [DPRNN-Tasnet](https://zenodo.org/record/4688000) | 0.96 | 18.82 | 18.29 | 28.92 |
+
+ +### SE demos +
expand
+You can try the interactive demo with Google Colab. Please click the following button to get access to the demos. + +[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1fjRJCh96SoYLZPRxsjF9VDv4Q2VoIckI?usp=sharing) + + +It is based on ESPnet2. Pretrained models are available for both speech enhancement and speech separation tasks. + +
+ ### ST results
expand
@@ -210,23 +303,23 @@ Available pretrained models in the demo script are listed as below. We list 4-gram BLEU of major ST tasks. #### end-to-end system -| Task | BLEU | Pretrained model | -| ---- | :----: | :----: | +| Task | BLEU | Pretrained model | +| ------------------------------------------------- | :---: | :-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | | Fisher-CallHome Spanish fisher_test (Es->En) | 51.03 | [link](https://github.com/espnet/espnet/blob/master/egs/fisher_callhome_spanish/st1/RESULTS.md#train_spen_lcrm_pytorch_train_pytorch_transformer_bpe_short_long_bpe1000_specaug_asrtrans_mttrans) | | Fisher-CallHome Spanish callhome_evltest (Es->En) | 20.44 | [link](https://github.com/espnet/espnet/blob/master/egs/fisher_callhome_spanish/st1/RESULTS.md#train_spen_lcrm_pytorch_train_pytorch_transformer_bpe_short_long_bpe1000_specaug_asrtrans_mttrans) | -| Libri-trans test (En->Fr) | 16.70 | [link](https://github.com/espnet/espnet/blob/master/egs/libri_trans/st1/RESULTS.md#train_spfr_lc_pytorch_train_pytorch_transformer_bpe_short_long_bpe1000_specaug_asrtrans_mttrans-1) | -| How2 dev5 (En->Pt) | 45.68 | [link](https://github.com/espnet/espnet/blob/master/egs/how2/st1/RESULTS.md#trainpt_tc_pytorch_train_pytorch_transformer_short_long_bpe8000_specaug_asrtrans_mttrans-1) | -| Must-C tst-COMMON (En->De) | 22.91 | [link](https://github.com/espnet/espnet/blob/master/egs/must_c/st1/RESULTS.md#train_spen-dede_tc_pytorch_train_pytorch_transformer_short_long_bpe8000_specaug_asrtrans_mttrans) | -| Mboshi-French dev (Fr->Mboshi) | 6.18 | N/A | +| Libri-trans test (En->Fr) | 16.70 | [link](https://github.com/espnet/espnet/blob/master/egs/libri_trans/st1/RESULTS.md#train_spfr_lc_pytorch_train_pytorch_transformer_bpe_short_long_bpe1000_specaug_asrtrans_mttrans-1) | +| How2 dev5 (En->Pt) | 45.68 | [link](https://github.com/espnet/espnet/blob/master/egs/how2/st1/RESULTS.md#trainpt_tc_pytorch_train_pytorch_transformer_short_long_bpe8000_specaug_asrtrans_mttrans-1) | +| Must-C tst-COMMON (En->De) | 22.91 | [link](https://github.com/espnet/espnet/blob/master/egs/must_c/st1/RESULTS.md#train_spen-dede_tc_pytorch_train_pytorch_transformer_short_long_bpe8000_specaug_asrtrans_mttrans) | +| Mboshi-French dev (Fr->Mboshi) | 6.18 | N/A | #### cascaded system -| Task | BLEU | Pretrained model | -| ---- | :----: | :----: | -| Fisher-CallHome Spanish fisher_test (Es->En) | 42.16 | N/A | -| Fisher-CallHome Spanish callhome_evltest (Es->En) | 19.82 | N/A | -| Libri-trans test (En->Fr) | 16.96 | N/A | -| How2 dev5 (En->Pt) | 44.90 | N/A | -| Must-C tst-COMMON (En->De) | 23.65 | N/A | +| Task | BLEU | Pretrained model | +| ------------------------------------------------- | :---: | :--------------: | +| Fisher-CallHome Spanish fisher_test (Es->En) | 42.16 | N/A | +| Fisher-CallHome Spanish callhome_evltest (Es->En) | 19.82 | N/A | +| Libri-trans test (En->Fr) | 16.96 | N/A | +| How2 dev5 (En->Pt) | 44.90 | N/A | +| Must-C tst-COMMON (En->De) | 23.65 | N/A | If you want to check the results of the other recipes, please check `egs//st1/RESULTS.md`. @@ -259,9 +352,9 @@ The sampling rate must be consistent with that of data used in training. Available pretrained models in the demo script are listed as below. 
-| Model | Notes | -| :------ | :------ | -| [fisher_callhome_spanish.transformer.v1](https://drive.google.com/open?id=1hawp5ZLw4_SIHIT3edglxbKIIkPVe8n3) | Transformer-ST trained on Fisher-CallHome Spanish Es->En | +| Model | Notes | +| :----------------------------------------------------------------------------------------------------------- | :------------------------------------------------------- | +| [fisher_callhome_spanish.transformer.v1](https://drive.google.com/open?id=1hawp5ZLw4_SIHIT3edglxbKIIkPVe8n3) | Transformer-ST trained on Fisher-CallHome Spanish Es->En |
@@ -270,17 +363,18 @@ Available pretrained models in the demo script are listed as below.
<details><summary>expand</summary>
-| Task | BLEU | Pretrained model | -| ---- | :----: | :----: | +| Task | BLEU | Pretrained model | +| ------------------------------------------------- | :---: | :-------------------------------------------------------------------------------------------------------------------------------------------------------------: | | Fisher-CallHome Spanish fisher_test (Es->En) | 61.45 | [link](https://github.com/espnet/espnet/blob/master/egs/fisher_callhome_spanish/mt1/RESULTS.md#trainen_lcrm_lcrm_pytorch_train_pytorch_transformer_bpe_bpe1000) | | Fisher-CallHome Spanish callhome_evltest (Es->En) | 29.86 | [link](https://github.com/espnet/espnet/blob/master/egs/fisher_callhome_spanish/mt1/RESULTS.md#trainen_lcrm_lcrm_pytorch_train_pytorch_transformer_bpe_bpe1000) | -| Libri-trans test (En->Fr) | 18.09 | [link](https://github.com/espnet/espnet/blob/master/egs/libri_trans/mt1/RESULTS.md#trainfr_lcrm_tc_pytorch_train_pytorch_transformer_bpe1000) | -| How2 dev5 (En->Pt) | 58.61 | [link](https://github.com/espnet/espnet/blob/master/egs/how2/mt1/RESULTS.md#trainpt_tc_tc_pytorch_train_pytorch_transformer_bpe8000) | -| Must-C tst-COMMON (En->De) | 27.63 | [link](https://github.com/espnet/espnet/blob/master/egs/must_c/mt1/RESULTS.md#summary-4-gram-bleu) | -| IWSLT'14 test2014 (En->De) | 24.70 | [link](https://github.com/espnet/espnet/blob/master/egs/iwslt16/mt1/RESULTS.md#result) | -| IWSLT'14 test2014 (De->En) | 29.22 | [link](https://github.com/espnet/espnet/blob/master/egs/iwslt16/mt1/RESULTS.md#result) | -| IWSLT'16 test2014 (En->De) | 24.05 | [link](https://github.com/espnet/espnet/blob/master/egs/iwslt16/mt1/RESULTS.md#result) | -| IWSLT'16 test2014 (De->En) | 29.13 | [link](https://github.com/espnet/espnet/blob/master/egs/iwslt16/mt1/RESULTS.md#result) | +| Libri-trans test (En->Fr) | 18.09 | [link](https://github.com/espnet/espnet/blob/master/egs/libri_trans/mt1/RESULTS.md#trainfr_lcrm_tc_pytorch_train_pytorch_transformer_bpe1000) | +| How2 dev5 (En->Pt) | 58.61 | [link](https://github.com/espnet/espnet/blob/master/egs/how2/mt1/RESULTS.md#trainpt_tc_tc_pytorch_train_pytorch_transformer_bpe8000) | +| Must-C tst-COMMON (En->De) | 27.63 | [link](https://github.com/espnet/espnet/blob/master/egs/must_c/mt1/RESULTS.md#summary-4-gram-bleu) | +| IWSLT'14 test2014 (En->De) | 24.70 | [link](https://github.com/espnet/espnet/blob/master/egs/iwslt16/mt1/RESULTS.md#result) | +| IWSLT'14 test2014 (De->En) | 29.22 | [link](https://github.com/espnet/espnet/blob/master/egs/iwslt16/mt1/RESULTS.md#result) | +| IWSLT'14 test2014 (De->En) | 32.2 | [link](https://github.com/espnet/espnet/blob/master/egs2/iwslt14/mt1/README.md) | +| IWSLT'16 test2014 (En->De) | 24.05 | [link](https://github.com/espnet/espnet/blob/master/egs/iwslt16/mt1/RESULTS.md#result) | +| IWSLT'16 test2014 (De->En) | 29.13 | [link](https://github.com/espnet/espnet/blob/master/egs/iwslt16/mt1/RESULTS.md#result) |
@@ -288,7 +382,7 @@ Available pretrained models in the demo script are listed as below.
<details><summary>ESPnet2</summary>
-You can listen to the generated samples in the following url. +You can listen to the generated samples in the following URL. - [ESPnet2 TTS generated samples](https://drive.google.com/drive/folders/1H3fnlBbWMEkQUfrHqosKN_ZX_WjO29ma?usp=sharing) > Note that in the generation we use Griffin-Lim (`wav/`) and [Parallel WaveGAN](https://github.com/kan-bayashi/ParallelWaveGAN) (`wav_pwg/`). @@ -340,19 +434,19 @@ If you want to build your own neural vocoder, please check the above repositorie Here we list all of the pretrained neural vocoders. Please download and enjoy the generation of high quality speech! | Model link | Lang | Fs [Hz] | Mel range [Hz] | FFT / Shift / Win [pt] | Model type | -| :------ | :---: | :----: | :--------: | :---------------: | :------ | -| [ljspeech.wavenet.softmax.ns.v1](https://drive.google.com/open?id=1eA1VcRS9jzFa-DovyTgJLQ_jmwOLIi8L) | EN | 22.05k | None | 1024 / 256 / None | [Softmax WaveNet](https://github.com/kan-bayashi/PytorchWaveNetVocoder) | -| [ljspeech.wavenet.mol.v1](https://drive.google.com/open?id=1sY7gEUg39QaO1szuN62-Llst9TrFno2t) | EN | 22.05k | None | 1024 / 256 / None | [MoL WaveNet](https://github.com/r9y9/wavenet_vocoder) | -| [ljspeech.parallel_wavegan.v1](https://drive.google.com/open?id=1tv9GKyRT4CDsvUWKwH3s_OfXkiTi0gw7) | EN | 22.05k | None | 1024 / 256 / None | [Parallel WaveGAN](https://github.com/kan-bayashi/ParallelWaveGAN) | -| [ljspeech.wavenet.mol.v2](https://drive.google.com/open?id=1es2HuKUeKVtEdq6YDtAsLNpqCy4fhIXr) | EN | 22.05k | 80-7600 | 1024 / 256 / None | [MoL WaveNet](https://github.com/r9y9/wavenet_vocoder) | -| [ljspeech.parallel_wavegan.v2](https://drive.google.com/open?id=1Grn7X9wD35UcDJ5F7chwdTqTa4U7DeVB) | EN | 22.05k | 80-7600 | 1024 / 256 / None | [Parallel WaveGAN](https://github.com/kan-bayashi/ParallelWaveGAN) | -| [ljspeech.melgan.v1](https://drive.google.com/open?id=1ipPWYl8FBNRlBFaKj1-i23eQpW_W_YcR) | EN | 22.05k | 80-7600 | 1024 / 256 / None | [MelGAN](https://github.com/kan-bayashi/ParallelWaveGAN) | -| [ljspeech.melgan.v3](https://drive.google.com/open?id=1_a8faVA5OGCzIcJNw4blQYjfG4oA9VEt) | EN | 22.05k | 80-7600 | 1024 / 256 / None | [MelGAN](https://github.com/kan-bayashi/ParallelWaveGAN) | -| [libritts.wavenet.mol.v1](https://drive.google.com/open?id=1jHUUmQFjWiQGyDd7ZeiCThSjjpbF_B4h) | EN | 24k | None | 1024 / 256 / None | [MoL WaveNet](https://github.com/r9y9/wavenet_vocoder) | -| [jsut.wavenet.mol.v1](https://drive.google.com/open?id=187xvyNbmJVZ0EZ1XHCdyjZHTXK9EcfkK) | JP | 24k | 80-7600 | 2048 / 300 / 1200 | [MoL WaveNet](https://github.com/r9y9/wavenet_vocoder) | -| [jsut.parallel_wavegan.v1](https://drive.google.com/open?id=1OwrUQzAmvjj1x9cDhnZPp6dqtsEqGEJM) | JP | 24k | 80-7600 | 2048 / 300 / 1200 | [Parallel WaveGAN](https://github.com/kan-bayashi/ParallelWaveGAN) | -| [csmsc.wavenet.mol.v1](https://drive.google.com/open?id=1PsjFRV5eUP0HHwBaRYya9smKy5ghXKzj) | ZH | 24k | 80-7600 | 2048 / 300 / 1200 | [MoL WaveNet](https://github.com/r9y9/wavenet_vocoder) | -| [csmsc.parallel_wavegan.v1](https://drive.google.com/open?id=10M6H88jEUGbRWBmU1Ff2VaTmOAeL8CEy) | ZH | 24k | 80-7600 | 2048 / 300 / 1200 | [Parallel WaveGAN](https://github.com/kan-bayashi/ParallelWaveGAN) | +| :--------------------------------------------------------------------------------------------------- | :---: | :-----: | :------------: | :--------------------: | :---------------------------------------------------------------------- | +| 
[ljspeech.wavenet.softmax.ns.v1](https://drive.google.com/open?id=1eA1VcRS9jzFa-DovyTgJLQ_jmwOLIi8L) | EN | 22.05k | None | 1024 / 256 / None | [Softmax WaveNet](https://github.com/kan-bayashi/PytorchWaveNetVocoder) | +| [ljspeech.wavenet.mol.v1](https://drive.google.com/open?id=1sY7gEUg39QaO1szuN62-Llst9TrFno2t) | EN | 22.05k | None | 1024 / 256 / None | [MoL WaveNet](https://github.com/r9y9/wavenet_vocoder) | +| [ljspeech.parallel_wavegan.v1](https://drive.google.com/open?id=1tv9GKyRT4CDsvUWKwH3s_OfXkiTi0gw7) | EN | 22.05k | None | 1024 / 256 / None | [Parallel WaveGAN](https://github.com/kan-bayashi/ParallelWaveGAN) | +| [ljspeech.wavenet.mol.v2](https://drive.google.com/open?id=1es2HuKUeKVtEdq6YDtAsLNpqCy4fhIXr) | EN | 22.05k | 80-7600 | 1024 / 256 / None | [MoL WaveNet](https://github.com/r9y9/wavenet_vocoder) | +| [ljspeech.parallel_wavegan.v2](https://drive.google.com/open?id=1Grn7X9wD35UcDJ5F7chwdTqTa4U7DeVB) | EN | 22.05k | 80-7600 | 1024 / 256 / None | [Parallel WaveGAN](https://github.com/kan-bayashi/ParallelWaveGAN) | +| [ljspeech.melgan.v1](https://drive.google.com/open?id=1ipPWYl8FBNRlBFaKj1-i23eQpW_W_YcR) | EN | 22.05k | 80-7600 | 1024 / 256 / None | [MelGAN](https://github.com/kan-bayashi/ParallelWaveGAN) | +| [ljspeech.melgan.v3](https://drive.google.com/open?id=1_a8faVA5OGCzIcJNw4blQYjfG4oA9VEt) | EN | 22.05k | 80-7600 | 1024 / 256 / None | [MelGAN](https://github.com/kan-bayashi/ParallelWaveGAN) | +| [libritts.wavenet.mol.v1](https://drive.google.com/open?id=1jHUUmQFjWiQGyDd7ZeiCThSjjpbF_B4h) | EN | 24k | None | 1024 / 256 / None | [MoL WaveNet](https://github.com/r9y9/wavenet_vocoder) | +| [jsut.wavenet.mol.v1](https://drive.google.com/open?id=187xvyNbmJVZ0EZ1XHCdyjZHTXK9EcfkK) | JP | 24k | 80-7600 | 2048 / 300 / 1200 | [MoL WaveNet](https://github.com/r9y9/wavenet_vocoder) | +| [jsut.parallel_wavegan.v1](https://drive.google.com/open?id=1OwrUQzAmvjj1x9cDhnZPp6dqtsEqGEJM) | JP | 24k | 80-7600 | 2048 / 300 / 1200 | [Parallel WaveGAN](https://github.com/kan-bayashi/ParallelWaveGAN) | +| [csmsc.wavenet.mol.v1](https://drive.google.com/open?id=1PsjFRV5eUP0HHwBaRYya9smKy5ghXKzj) | ZH | 24k | 80-7600 | 2048 / 300 / 1200 | [MoL WaveNet](https://github.com/r9y9/wavenet_vocoder) | +| [csmsc.parallel_wavegan.v1](https://drive.google.com/open?id=10M6H88jEUGbRWBmU1Ff2VaTmOAeL8CEy) | ZH | 24k | 80-7600 | 2048 / 300 / 1200 | [Parallel WaveGAN](https://github.com/kan-bayashi/ParallelWaveGAN) | If you want to use the above pretrained vocoders, please exactly match the feature setting with them. @@ -393,7 +487,7 @@ synth_wav.sh example.txt # also you can use multiple sentences echo "THIS IS A DEMONSTRATION OF TEXT TO SPEECH." > example_multi.txt -echo "TEXT TO SPEECH IS A TECHQNIQUE TO CONVERT TEXT INTO SPEECH." >> example_multi.txt +echo "TEXT TO SPEECH IS A TECHNIQUE TO CONVERT TEXT INTO SPEECH." >> example_multi.txt synth_wav.sh example_multi.txt ``` @@ -434,16 +528,27 @@ The [Voice Conversion Challenge 2020](http://www.vc-challenge.org/) (VCC2020) ad In VCC2020, the objective is intra/cross lingual nonparallel VC. You can download converted samples of the cascade ASR+TTS baseline system [here](https://drive.google.com/drive/folders/1oeZo83GrOgtqxGwF7KagzIrfjr8X59Ue?usp=sharing). +
+ +### SLU results + +
+<details><summary>ESPnet2</summary>
+
+- Transformer-based SLU for the Fluent Speech Commands dataset
+
+In SLU, the objective is to infer the meaning or intent of a spoken utterance. The [Fluent Speech Commands dataset](https://fluent.ai/fluent-speech-commands-a-dataset-for-spoken-language-understanding-research/) describes an intent as a combination of three slot values: action, object, and location. You can see the baseline results on this dataset [here](https://github.com/espnet/espnet/blob/master/egs2/fsc/asr1/RESULTS.md).
+
+
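+As a rough illustration of this slot structure (a sketch only; the label strings below are illustrative, not the dataset's exact inventory):
+
+```python
+# Hypothetical FSC-style example: an utterance maps to three slots
+# (action, object, location), and their combination is the intent.
+utterance = "Turn on the lights in the kitchen"
+intent = {"action": "activate", "object": "lights", "location": "kitchen"}
+# The full intent label combines the three slot values:
+print("{action}_{object}_{location}".format(**intent))  # activate_lights_kitchen
+```
+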
### CTC Segmentation demo

-<details><summary>expand</summary>
+
+<details><summary>ESPnet1</summary>
[CTC segmentation](https://arxiv.org/abs/2007.09127) determines utterance segments within audio files.
Aligned utterance segments constitute the labels of speech datasets.

-As demo, we align start and end of utterances within the audio file `ctc_align_test.wav`, using the example script `utils/ctc_align_wav.sh`.
+As a demo, we align the start and end of utterances within the audio file `ctc_align_test.wav`, using the example script `utils/asr_align_wav.sh`.

For preparation, set up a data directory:

```sh
@@ -497,14 +602,95 @@ A full example recipe is in `egs/tedlium2/align1/`.
+
+<details><summary>ESPnet2</summary>
+
+[CTC segmentation](https://arxiv.org/abs/2007.09127) determines utterance segments within audio files.
+Aligned utterance segments constitute the labels of speech datasets.
+
+As a demo, we align the start and end of utterances within the audio file `ctc_align_test.wav`.
+This can be done either directly from the Python command line or using the script `espnet2/bin/asr_align.py`.
+
+From the Python command line interface:
+
+```python
+# load a model with character tokens
+from espnet_model_zoo.downloader import ModelDownloader
+d = ModelDownloader(cachedir="./modelcache")
+wsjmodel = d.download_and_unpack("kamo-naoyuki/wsj")
+# load the example file included in the ESPnet repository
+import soundfile
+speech, rate = soundfile.read("./test_utils/ctc_align_test.wav")
+# CTC segmentation
+from espnet2.bin.asr_align import CTCSegmentation
+aligner = CTCSegmentation(**wsjmodel, fs=rate)
+text = """
+utt1 THE SALE OF THE HOTELS
+utt2 IS PART OF HOLIDAY'S STRATEGY
+utt3 TO SELL OFF ASSETS
+utt4 AND CONCENTRATE ON PROPERTY MANAGEMENT
+"""
+segments = aligner(speech, text)
+print(segments)
+# utt1 utt 0.26 1.73 -0.0154 THE SALE OF THE HOTELS
+# utt2 utt 1.73 3.19 -0.7674 IS PART OF HOLIDAY'S STRATEGY
+# utt3 utt 3.19 4.20 -0.7433 TO SELL OFF ASSETS
+# utt4 utt 4.20 6.10 -0.4899 AND CONCENTRATE ON PROPERTY MANAGEMENT
+```
+
+Aligning also works with fragments of the text.
+For this, set the `gratis_blank` option, which allows skipping unrelated audio sections without penalty.
+It's also possible to omit the utterance names at the beginning of each line by setting `kaldi_style_text` to False.
+
+```python
+aligner.set_config(gratis_blank=True, kaldi_style_text=False)
+text = ["SALE OF THE HOTELS", "PROPERTY MANAGEMENT"]
+segments = aligner(speech, text)
+print(segments)
+# utt_0000 utt 0.37 1.72 -2.0651 SALE OF THE HOTELS
+# utt_0001 utt 4.70 6.10 -5.0566 PROPERTY MANAGEMENT
+```
+
+The script `espnet2/bin/asr_align.py` uses a similar interface. To align utterances:
-## References
+```sh
+# ASR model and config files from pretrained model (e.g. from cachedir):
+asr_config=<path-to-model>/config.yaml
+asr_model=<path-to-model>/valid.*best.pth
+# prepare the text file
+wav="test_utils/ctc_align_test.wav"
+text="test_utils/ctc_align_text.txt"
+cat << EOF > ${text}
+utt1 THE SALE OF THE HOTELS
+utt2 IS PART OF HOLIDAY'S STRATEGY
+utt3 TO SELL OFF ASSETS
+utt4 AND CONCENTRATE
+utt5 ON PROPERTY MANAGEMENT
+EOF
+# obtain alignments:
+python espnet2/bin/asr_align.py --asr_train_config ${asr_config} --asr_model_file ${asr_model} --audio ${wav} --text ${text}
+# utt1 ctc_align_test 0.26 1.73 -0.0154 THE SALE OF THE HOTELS
+# utt2 ctc_align_test 1.73 3.19 -0.7674 IS PART OF HOLIDAY'S STRATEGY
+# utt3 ctc_align_test 3.19 4.20 -0.7433 TO SELL OFF ASSETS
+# utt4 ctc_align_test 4.20 4.97 -0.6017 AND CONCENTRATE
+# utt5 ctc_align_test 4.97 6.10 -0.3477 ON PROPERTY MANAGEMENT
+```
+
+The output of the script can be redirected to a `segments` file by adding the argument `--output segments`.
+Each line contains the file/utterance name, the utterance start and end times in seconds, and a confidence score; optionally also the utterance text.
+The confidence score is a probability in log space that indicates how well the utterance was aligned. If needed, remove bad utterances:
-[1] Shinji Watanabe, Takaaki Hori, Shigeki Karita, Tomoki Hayashi, Jiro Nishitoba, Yuya Unno, Nelson Enrique Yalta Soplin, Jahn Heymann, Matthew Wiesner, Nanxin Chen, Adithya Renduchintala, and Tsubasa Ochiai, "ESPnet: End-to-End Speech Processing Toolkit," *Proc. Interspeech'18*, pp. 2207-2211 (2018)
+```sh
+min_confidence_score=-7
+# here, we assume that the output was written to the file `segments`
+awk -v ms=${min_confidence_score} '{ if ($5 > ms) {print} }' segments
+```
-[2] Suyoun Kim, Takaaki Hori, and Shinji Watanabe, "Joint CTC-attention based end-to-end speech recognition using multi-task learning," *Proc. ICASSP'17*, pp. 4835--4839 (2017)
+See the module documentation for more information.
+It is recommended to use models with RNN-based encoders (such as BLSTMP) for aligning large audio files,
+rather than Transformer models, which have a high memory consumption on longer audio data.
+The sample rate of the audio must be consistent with that of the data used in training; adjust with `sox` if needed.
-[3] Shinji Watanabe, Takaaki Hori, Suyoun Kim, John R. Hershey and Tomoki Hayashi, "Hybrid CTC/Attention Architecture for End-to-End Speech Recognition," *IEEE Journal of Selected Topics in Signal Processing*, vol. 11, no. 8, pp. 1240-1253, Dec. 2017
+
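+The same confidence filtering can also be done in Python; a minimal sketch, assuming the five-column `segments` format described above (any remaining columns hold the utterance text):
+
+```python
+# Keep only segments whose confidence score (the 5th column) exceeds a threshold.
+min_confidence_score = -7.0
+with open("segments") as f:
+    for line in f:
+        columns = line.split()
+        # columns: utterance name, file name, start, end, score[, text ...]
+        if float(columns[4]) > min_confidence_score:
+            print(line.rstrip())
+```
+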
## Citations

@@ -551,4 +737,10 @@ A full example recipe is in `egs/tedlium2/align1/`.
   year={2021},
   organization={IEEE},
 }
+@article{arora2021espnet,
+  title={ESPnet-SLU: Advancing Spoken Language Understanding through ESPnet},
+  author={Arora, Siddhant and Dalmia, Siddharth and Denisov, Pavel and Chang, Xuankai and Ueda, Yushi and Peng, Yifan and Zhang, Yuekai and Kumar, Sujay and Ganesan, Karthik and Yan, Brian and others},
+  journal={arXiv preprint arXiv:2111.14706},
+  year={2021}
+}
 ```
diff --git a/ci/doc.sh b/ci/doc.sh
index d1f36bdfd13..114bc92b952 100755
--- a/ci/doc.sh
+++ b/ci/doc.sh
@@ -26,11 +26,13 @@ set -euo pipefail
 find ./utils/{*.sh,spm_*} -exec ./doc/usage2rst.sh {} \; | tee ./doc/_gen/utils_sh.rst
 find ./espnet2/bin/*.py -exec ./doc/usage2rst.sh {} \; | tee ./doc/_gen/espnet2_bin.rst

+./doc/notebook2rst.sh > ./doc/_gen/notebooks.rst
+
 # generate package doc
 ./doc/module2rst.py --root espnet espnet2 --dst ./doc --exclude espnet.bin

 # build html
-travis-sphinx build --source=doc --nowarn
+# TODO(karita): add -W to turn warnings into errors
+sphinx-build -b html doc doc/build

 touch doc/build/.nojekyll
-
diff --git a/ci/install.sh b/ci/install.sh
index 90d7b92d567..5bfed7584ad 100755
--- a/ci/install.sh
+++ b/ci/install.sh
@@ -14,27 +14,44 @@ ${CXX:-g++} -v
 mkdir -p kaldi/egs/wsj/s5/utils && touch kaldi/egs/wsj/s5/utils/parse_options.sh
 if ${USE_CONDA}; then
     ./setup_anaconda.sh venv espnet ${ESPNET_PYTHON_VERSION}
+    # To install via pip instead of conda
 else
-    ./setup_python.sh "$(command -v python3)" venv
+    ./setup_venv.sh "$(command -v python3)" venv
 fi
 . ./activate_python.sh
 make TH_VERSION="${TH_VERSION}"
-make warp-ctc.done warp-transducer.done chainer_ctc.done nkf.done moses.done mwerSegmenter.done pesq pyopenjtalk.done py3mmseg.done
+make warp-ctc.done warp-transducer.done chainer_ctc.done nkf.done moses.done mwerSegmenter.done pesq pyopenjtalk.done py3mmseg.done s3prl.done transformers.done phonemizer.done fairseq.done k2.done gtn.done longformer.done
 rm -rf kaldi
)
. tools/activate_python.sh
python3 --version

-pip3 install https://github.com/kpu/kenlm/archive/master.zip
+python3 -m pip install https://github.com/kpu/kenlm/archive/master.zip
+# NOTE(kamo): tensorboardx is used for chainer mode only
+python3 -m pip install tensorboardx
+# NOTE(kamo): Create matplotlib.cache to reduce runtime for test phase
+python3 -c "import matplotlib.pyplot"
 # NOTE(kan-bayashi): Fix the error in black installation.
 # See: https://github.com/psf/black/issues/1707
-pip3 uninstall -y typing
+python3 -m pip uninstall -y typing

 # install espnet
-pip3 install -e ".[test]"
-pip3 install -e ".[doc]"
+python3 -m pip install -e ".[test]"
+python3 -m pip install -e ".[doc]"

 # log
-pip3 freeze
+python3 -m pip freeze
+
+
+# Check pytorch version
+python3 << EOF
+import torch
+from distutils.version import LooseVersion as L
+version = L("$TH_VERSION")
+next_version = f"{version.version[0]}.{version.version[1]}.{version.version[2] + 1}"
+if L(torch.__version__) < version or L(torch.__version__) >= L(next_version):
+    raise RuntimeError(f"Pytorch=$TH_VERSION is expected, but got pytorch={torch.__version__}. 
This is a bug in installation scripts") +EOF diff --git a/ci/test_import_all.py b/ci/test_import_all.py new file mode 100755 index 00000000000..e8621bf9340 --- /dev/null +++ b/ci/test_import_all.py @@ -0,0 +1,40 @@ +#!/usr/bin/env python3 +import glob +import importlib +import sys + +try: + import k2 +except Exception: + has_k2 = False +else: + has_k2 = True +try: + import mir_eval +except Exception: + has_mir_eval = False +else: + has_mir_eval = True + + +for dirname in ["espnet", "espnet2"]: + for f in glob.glob(f"{dirname}/**/*.py"): + module_name = f.replace("/", ".")[:-3] + + if ( + ( + not has_k2 + and ( + module_name == "espnet2.bin.asr_inference_k2" + or module_name == "espnet2.fst.lm_rescore" + ) + ) + or (not has_mir_eval and module_name == "espnet2.bin.enh_scoring") + or module_name == "espnet2.tasks.enh_asr" + ): + print(f"[Skip] import {module_name}", file=sys.stderr) + continue + else: + print(f"import {module_name}", file=sys.stderr) + + importlib.import_module(module_name) diff --git a/ci/test_integration.sh b/ci/test_integration.sh deleted file mode 100755 index a1b763489ab..00000000000 --- a/ci/test_integration.sh +++ /dev/null @@ -1,269 +0,0 @@ -#!/usr/bin/env bash - -python="coverage run --append" - -touch .coverage - -# test asr recipe -cwd=$(pwd) -cd ./egs/mini_an4/asr1 || exit 1 -ln -sf ${cwd}/.coverage . -. path.sh # source here to avoid undefined variable errors - -set -euo pipefail - -echo "==== ASR (backend=pytorch lm=RNNLM) ===" -./run.sh --python "${python}" -echo "==== ASR (backend=pytorch, lm=TransformerLM) ===" -./run.sh --python "${python}" --stage 3 --stop-stage 3 --lm-config conf/lm_transformer.yaml --decode-config "$(change_yaml.py conf/decode.yaml -a api=v2)" -# skip duplicated ASR training stage 4 -./run.sh --python "${python}" --stage 5 --lm-config conf/lm_transformer.yaml --decode-config "$(change_yaml.py conf/decode.yaml -a api=v2)" -echo "==== ASR (backend=pytorch, dtype=float64) ===" -./run.sh --python "${python}" --stage 3 --train-config "$(change_yaml.py conf/train.yaml -a train-dtype=float64)" --decode-config "$(change_yaml.py conf/decode.yaml -a api=v2 -a dtype=float64)" -echo "==== ASR (backend=chainer) ===" -./run.sh --python "${python}" --stage 3 --backend chainer - -# skip duplicated ASR training stage 2,3 -# test rnn recipe -echo "=== ASR (backend=pytorch, model=rnn-pure-ctc) ===" -./run.sh --python "${python}" --stage 4 --train-config conf/train_pure_ctc.yaml \ - --decode-config conf/decode_pure_ctc.yaml -echo "=== ASR (backend=pytorch, model=rnn-no-ctc) ===" -./run.sh --python "${python}" --stage 4 --train-config conf/train_no_ctc.yaml \ - --decode-config conf/decode_no_ctc.yaml - -# test transformer recipe -echo "=== ASR (backend=pytorch, model=transformer) ===" -./run.sh --python "${python}" --stage 4 --train-config conf/train_transformer.yaml \ - --decode-config conf/decode.yaml -./run.sh --python "${python}" --stage 5 --train-config conf/train_transformer.yaml \ - --decode-config conf/decode.yaml --metric acc -./run.sh --python "${python}" --stage 5 --train-config conf/train_transformer.yaml \ - --decode-config conf/decode.yaml --metric loss -echo "=== ASR (backend=pytorch, model=conformer) ===" -./run.sh --python "${python}" --stage 4 --train-config conf/train_conformer.yaml \ - --decode-config conf/decode.yaml -echo "=== ASR (backend=pytorch, model=transformer-pure-ctc) ===" -./run.sh --python "${python}" --stage 4 --train-config conf/train_transformer_pure_ctc.yaml \ - --decode-config conf/decode_pure_ctc.yaml -echo "=== ASR 
(backend=pytorch, model=conformer-pure-ctc) ===" -./run.sh --python "${python}" --stage 4 --train-config conf/train_conformer_pure_ctc.yaml \ - --decode-config conf/decode_pure_ctc.yaml -echo "=== ASR (backend=pytorch, model=transformer-no-ctc) ===" -./run.sh --python "${python}" --stage 4 --train-config conf/train_transformer_no_ctc.yaml \ - --decode-config conf/decode_no_ctc.yaml -echo "=== ASR (backend=pytorch num-encs 2, model=transformer) ===" -./run.sh --python "${python}" --stage 4 --train-config conf/train_transformer.yaml \ - --decode-config conf/decode.yaml - -# test transducer recipe -echo "=== ASR (backend=pytorch, model=rnnt) ===" -./run.sh --python "${python}" --stage 4 --train-config conf/train_transducer.yaml \ - --decode-config conf/decode_transducer.yaml -echo "=== ASR (backend=pytorch, model=transformer-transducer) ===" -./run.sh --python "${python}" --stage 4 --train-config conf/train_transformer_transducer.yaml \ - --decode-config conf/decode_transducer.yaml -echo "=== ASR (backend=pytorch, model=conformer-transducer) ===" -./run.sh --python "${python}" --stage 4 --train-config conf/train_conformer_transducer.yaml \ - --decode-config conf/decode_transducer.yaml - -# test finetuning -## test transfer learning -echo "=== ASR (backend=pytorch, model=rnnt, transfer_learning=enc) ===" -./run.sh --python "${python}" --stage 4 --train-config conf/train_transducer_pre_init_enc.yaml \ - --decode-config conf/decode_transducer.yaml -echo "=== ASR (backend=pytorch, model=rnnt, transfer_learning=LM) ===" -./run.sh --python "${python}" --stage 4 --train-config conf/train_transducer_pre_init_lm.yaml \ - --decode-config conf/decode_transducer.yaml -## to do: cover all tasks + freezing option - -echo "==== ASR (backend=pytorch num-encs 2) ===" -./run.sh --python "${python}" --stage 2 --train-config ./conf/train_mulenc2.yaml --decode-config ./conf/decode_mulenc2.yaml --mulenc true -# Remove generated files in order to reduce the disk usage -rm -rf exp tensorboard dump data -cd ${cwd} || exit 1 - -# test asr_mix recipe -cd ./egs/mini_an4/asr_mix1 || exit 1 -ln -sf ${cwd}/.coverage . - -echo "==== ASR Mix (backend=pytorch, model=rnn) ===" -./run.sh --python "${python}" --train-config conf/train_multispkr.yaml -echo "==== ASR Mix (backend=pytorch, model=transformer) ===" -./run.sh --python "${python}" --stage 4 --train-config conf/train_multispkr_transformer.yaml -# Remove generated files in order to reduce the disk usage -rm -rf exp tensorboard dump data -cd "${cwd}" || exit 1 - -# test st recipe -cd ./egs/mini_an4/st1 || exit 1 -ln -sf ${cwd}/.coverage . 
- -echo "==== ST (backend=pytorch) ===" -./run.sh --python "${python}" -echo "==== ST (backend=pytorch asr0.3) ===" -./run.sh --python "${python}" --stage 4 --train_config conf/train_asr0.3.yaml -echo "==== ST (backend=pytorch ctc asr0.3) ===" -./run.sh --python "${python}" --stage 4 --train_config conf/train_ctc_asr0.3.yaml -echo "==== ST (backend=pytorch mt0.3) ===" -./run.sh --python "${python}" --stage 4 --train_config conf/train_mt0.3.yaml -echo "==== ST (backend=pytorch asr0.2 mt0.2) ===" -./run.sh --python "${python}" --stage 4 --train_config conf/train_asr0.2_mt0.2.yaml -echo "==== ST (backend=pytorch, model=transformer) ===" -./run.sh --python "${python}" --stage 4 --train_config conf/train_transformer.yaml -./run.sh --python "${python}" --stage 5 --train_config conf/train_transformer.yaml \ - --metric acc -./run.sh --python "${python}" --stage 5 --train_config conf/train_transformer.yaml \ - --metric bleu -./run.sh --python "${python}" --stage 5 --train_config conf/train_transformer.yaml \ - --metric loss -echo "==== ST (backend=pytorch asr0.3, model=transformer) ===" -./run.sh --python "${python}" --stage 4 --train_config conf/train_transformer_asr0.3.yaml -echo "==== ST (backend=pytorch ctc asr0.3, model=transformer) ===" -./run.sh --python "${python}" --stage 4 --train_config conf/train_transformer_ctc_asr0.3.yaml -echo "==== ST (backend=pytorch mt0.3, model=transformer) ===" -./run.sh --python "${python}" --stage 4 --train_config conf/train_transformer_mt0.3.yaml -echo "==== ST (backend=pytorch asr0.2 mt0.2, model=transformer) ===" -./run.sh --python "${python}" --stage 4 --train_config conf/train_transformer_asr0.2_mt0.2.yaml -echo "==== ST (backend=pytorch asr0.2 mt0.2, model=conformer) ===" -./run.sh --python "${python}" --stage 4 --train_config conf/train_conformer_asr0.2_mt0.2.yaml -# Remove generated files in order to reduce the disk usage -rm -rf exp tensorboard dump data -cd "${cwd}" || exit 1 - -# test mt recipe -cd ./egs/mini_an4/mt1 || exit 1 -ln -sf ${cwd}/.coverage . - -echo "==== MT (backend=pytorch) ===" -./run.sh --python "${python}" -echo "==== MT (backend=pytorch, model=transformer) ===" -./run.sh --python "${python}" --stage 4 --train_config conf/train_transformer.yaml -./run.sh --python "${python}" --stage 5 --train_config conf/train_transformer.yaml \ - --metric acc -./run.sh --python "${python}" --stage 5 --train_config conf/train_transformer.yaml \ - --metric bleu -./run.sh --python "${python}" --stage 5 --train_config conf/train_transformer.yaml \ - --metric loss -# Remove generated files in order to reduce the disk usage -rm -rf exp tensorboard dump data -cd "${cwd}" || exit 1 - -# test tts recipe -cd ./egs/mini_an4/tts1 || exit 1 -ln -sf ${cwd}/.coverage . - -echo "==== TTS (backend=pytorch) ===" -./run.sh --python "${python}" -# Remove generated files in order to reduce the disk usage -rm -rf exp tensorboard dump data -cd "${cwd}" || exit 1 - -echo "=== run integration tests at test_utils ===" - -PATH=$(pwd)/bats-core/bin:$PATH -if ! [ -x "$(command -v bats)" ]; then - echo "=== install bats ===" - git clone https://github.com/bats-core/bats-core.git -fi -bats test_utils/integration_test_*.bats - - -#### Make sure chainer-independent #### -python3 -m pip uninstall -y chainer - -# [ESPnet2] test asr recipe -cd ./egs2/mini_an4/asr1 || exit 1 -ln -sf ${cwd}/.coverage . 
-echo "==== [ESPnet2] ASR ===" -./run.sh --stage 1 --stop-stage 1 -feats_types="raw fbank_pitch" -token_types="bpe char" -for t in ${feats_types}; do - ./run.sh --stage 2 --stop-stage 4 --feats-type "${t}" --python "${python}" -done -for t in ${token_types}; do - ./run.sh --stage 5 --stop-stage 5 --token-type "${t}" --python "${python}" -done -for t in ${feats_types}; do - for t2 in ${token_types}; do - echo "==== feats_type=${t}, token_types=${t2} ===" - ./run.sh --ngpu 0 --stage 6 --stop-stage 13 --skip-upload false --feats-type "${t}" --token-type "${t2}" \ - --asr-args "--max_epoch=1" --lm-args "--max_epoch=1" --python "${python}" - done -done -# Remove generated files in order to reduce the disk usage -rm -rf exp dump data -cd "${cwd}" || exit 1 - -# [ESPnet2] test tts recipe -cd ./egs2/mini_an4/tts1 || exit 1 -ln -sf ${cwd}/.coverage . -echo "==== [ESPnet2] TTS ===" -./run.sh --stage 1 --stop-stage 1 --python "${python}" -feats_types="raw fbank stft" -for t in ${feats_types}; do - echo "==== feats_type=${t} ===" - ./run.sh --ngpu 0 --stage 2 --stop-stage 8 --skip-upload false --feats-type "${t}" --train-args "--max_epoch 1" --python "${python}" -done -# Remove generated files in order to reduce the disk usage -rm -rf exp dump data -cd "${cwd}" || exit 1 - -# [ESPnet2] test enh recipe -if python -c 'import torch as t; from distutils.version import LooseVersion as L; assert L(t.__version__) >= L("1.2.0")' &> /dev/null; then - cd ./egs2/mini_an4/enh1 || exit 1 - ln -sf ${cwd}/.coverage . - echo "==== [ESPnet2] ENH ===" - ./run.sh --stage 1 --stop-stage 1 --python "${python}" - feats_types="raw" - for t in ${feats_types}; do - echo "==== feats_type=${t} ===" - ./run.sh --ngpu 0 --stage 2 --stop-stage 9 --skip-upload false --feats-type "${t}" --spk-num 1 --enh-args "--max_epoch=1" --python "${python}" - done - # Remove generated files in order to reduce the disk usage - rm -rf exp dump data - cd "${cwd}" || exit 1 -fi - -# [ESPnet2] Validate configuration files -echo "" > dummy_token_list -echo "==== [ESPnet2] Validation configuration files ===" -if python3 -c 'import torch as t; from distutils.version import LooseVersion as L; assert L(t.__version__) >= L("1.6.0")' &> /dev/null; then - for f in egs2/*/asr1/conf/train_asr*.yaml; do - python3 -m espnet2.bin.asr_train --config "${f}" --iterator_type none --dry_run true --output_dir out --token_list dummy_token_list - done - for f in egs2/*/asr1/conf/train_lm*.yaml; do - python3 -m espnet2.bin.lm_train --config "${f}" --iterator_type none --dry_run true --output_dir out --token_list dummy_token_list - done - for f in egs2/*/tts1/conf/train*.yaml; do - python3 -m espnet2.bin.tts_train --config "${f}" --iterator_type none --normalize none --dry_run true --output_dir out --token_list dummy_token_list - done - for f in egs2/*/enh1/conf/train*.yaml; do - python -m espnet2.bin.enh_train --config "${f}" --iterator_type none --dry_run true --output_dir out - done -fi - -# These files must be same each other. 
-for base in cmd.sh conf/slurm.conf conf/queue.conf conf/pbs.conf; do - file1= - for f in egs2/*/*/"${base}"; do - if [ -z "${file1}" ]; then - file1="${f}" - fi - diff "${file1}" "${f}" || { echo "Error: ${file1} and ${f} differ: To solve: for f in egs2/*/*/${base}; do cp egs2/TEMPLATE/asr1/${base} \${f}; done" ; exit 1; } - done -done - - -echo "==== [ESPnet2] test setup.sh ===" -for d in egs2/TEMPLATE/*; do - if [ -d "${d}" ]; then - d="${d##*/}" - egs2/TEMPLATE/"$d"/setup.sh egs2/test/"${d}" - fi -done -echo "=== report ===" - -coverage report -coverage xml diff --git a/ci/test_integration_espnet1.sh b/ci/test_integration_espnet1.sh new file mode 100755 index 00000000000..d88bac14c56 --- /dev/null +++ b/ci/test_integration_espnet1.sh @@ -0,0 +1,173 @@ +#!/usr/bin/env bash + +python="coverage run --append" + +cwd=$(pwd) + +# test asr recipe +cd ./egs/mini_an4/asr1 || exit 1 +. path.sh # source here to avoid undefined variable errors + +set -euo pipefail + +echo "==== ASR (backend=pytorch lm=RNNLM) ===" +./run.sh --python "${python}" +echo "==== ASR (backend=pytorch, lm=TransformerLM) ===" +./run.sh --python "${python}" --stage 3 --stop-stage 3 --lm-config conf/lm_transformer.yaml --decode-config "$(change_yaml.py conf/decode.yaml -a api=v2)" +# skip duplicated ASR training stage 4 +./run.sh --python "${python}" --stage 5 --lm-config conf/lm_transformer.yaml --decode-config "$(change_yaml.py conf/decode.yaml -a api=v2)" +echo "==== ASR (backend=pytorch, dtype=float64) ===" +./run.sh --python "${python}" --stage 3 --train-config "$(change_yaml.py conf/train.yaml -a train-dtype=float64)" --decode-config "$(change_yaml.py conf/decode.yaml -a api=v2 -a dtype=float64)" +echo "==== ASR (backend=pytorch, quantize-asr-model true, quantize-lm-model true) ===" +./run.sh --python "${python}" --stage 5 --decode-config "$(change_yaml.py conf/decode.yaml -a quantize-asr-model=true -a quantize-lm-model=true)" +echo "==== ASR (backend=pytorch, quantize-asr-model true, quantize-lm-model true api v2) ===" +./run.sh --python "${python}" --stage 5 --decode-config "$(change_yaml.py conf/decode.yaml -a quantize-asr-model=true -a quantize-lm-model=true -a quantize-config=['Linear'] -a api=v2)" + +echo "==== ASR (backend=chainer) ===" +./run.sh --python "${python}" --stage 3 --backend chainer + +# skip duplicated ASR training stage 2,3 +# test rnn recipe +echo "=== ASR (backend=pytorch, model=rnn-pure-ctc) ===" +./run.sh --python "${python}" --stage 4 --train-config conf/train_pure_ctc.yaml \ + --decode-config conf/decode_pure_ctc.yaml +echo "=== ASR (backend=pytorch, model=rnn-no-ctc) ===" +./run.sh --python "${python}" --stage 4 --train-config conf/train_no_ctc.yaml \ + --decode-config conf/decode_no_ctc.yaml + +# test transformer recipe +echo "=== ASR (backend=pytorch, model=transformer) ===" +./run.sh --python "${python}" --stage 4 --train-config conf/train_transformer.yaml \ + --decode-config conf/decode.yaml +./run.sh --python "${python}" --stage 5 --train-config conf/train_transformer.yaml \ + --decode-config conf/decode.yaml --metric acc +./run.sh --python "${python}" --stage 5 --train-config conf/train_transformer.yaml \ + --decode-config conf/decode.yaml --metric loss +echo "=== ASR (backend=pytorch, model=conformer) ===" +./run.sh --python "${python}" --stage 4 --train-config conf/train_conformer.yaml \ + --decode-config conf/decode.yaml +echo "=== ASR (backend=pytorch, model=transformer-pure-ctc) ===" +./run.sh --python "${python}" --stage 4 --train-config conf/train_transformer_pure_ctc.yaml \ + 
--decode-config conf/decode_pure_ctc.yaml +echo "=== ASR (backend=pytorch, model=conformer-pure-ctc) ===" +./run.sh --python "${python}" --stage 4 --train-config conf/train_conformer_pure_ctc.yaml \ + --decode-config conf/decode_pure_ctc.yaml +echo "=== ASR (backend=pytorch, model=transformer-no-ctc) ===" +./run.sh --python "${python}" --stage 4 --train-config conf/train_transformer_no_ctc.yaml \ + --decode-config conf/decode_no_ctc.yaml +echo "=== ASR (backend=pytorch num-encs 2, model=transformer) ===" +./run.sh --python "${python}" --stage 4 --train-config conf/train_transformer.yaml \ + --decode-config conf/decode.yaml + +# test transducer recipe +echo "=== ASR (backend=pytorch, model=rnnt) ===" +./run.sh --python "${python}" --stage 4 --train-config conf/train_transducer.yaml \ + --decode-config conf/decode_transducer.yaml +echo "=== ASR (backend=pytorch, model=transformer-transducer) ===" +./run.sh --python "${python}" --stage 4 --train-config conf/train_transformer_transducer.yaml \ + --decode-config conf/decode_transducer.yaml +echo "=== ASR (backend=pytorch, model=conformer-transducer) ===" +./run.sh --python "${python}" --stage 4 --train-config conf/train_conformer_transducer.yaml \ + --decode-config conf/decode_transducer.yaml + +# test transducer with auxiliary task recipe +echo "=== ASR (backend=pytorch, model=rnnt, tasks=L1+L2+L3+L4+L5)" +./run.sh --python "${python}" --stage 4 --train-config conf/train_transducer_aux.yaml \ + --decode-config conf/decode_transducer.yaml +echo "=== ASR (backend=pytorch, model=conformer-transducer, tasks=L1+L2+L5) ===" +./run.sh --python "${python}" --stage 4 --train-config conf/train_conformer_transducer_aux.yaml \ + --decode-config conf/decode_transducer.yaml + +# test finetuning +## test transfer learning +echo "=== ASR (backend=pytorch, model=rnnt, transfer_learning=enc) ===" +./run.sh --python "${python}" --stage 4 --train-config conf/train_transducer_pre_init_enc.yaml \ + --decode-config conf/decode_transducer.yaml +echo "=== ASR (backend=pytorch, model=rnnt, transfer_learning=LM) ===" +./run.sh --python "${python}" --stage 4 --train-config conf/train_transducer_pre_init_lm.yaml \ + --decode-config conf/decode_transducer.yaml +## to do: cover all tasks + freezing option + +echo "==== ASR (backend=pytorch num-encs 2) ===" +./run.sh --python "${python}" --stage 2 --train-config ./conf/train_mulenc2.yaml --decode-config ./conf/decode_mulenc2.yaml --mulenc true +# Remove generated files in order to reduce the disk usage +rm -rf exp tensorboard dump data +cd ${cwd} || exit 1 + +# test asr_mix recipe +cd ./egs/mini_an4/asr_mix1 || exit 1 + +echo "==== ASR Mix (backend=pytorch, model=rnn) ===" +./run.sh --python "${python}" --train-config conf/train_multispkr.yaml +echo "==== ASR Mix (backend=pytorch, model=transformer) ===" +./run.sh --python "${python}" --stage 4 --train-config conf/train_multispkr_transformer.yaml +# Remove generated files in order to reduce the disk usage +rm -rf exp tensorboard dump data +cd "${cwd}" || exit 1 + +# test st recipe +cd ./egs/mini_an4/st1 || exit 1 + +echo "==== ST (backend=pytorch) ===" +./run.sh --python "${python}" +echo "==== ST (backend=pytorch asr0.3) ===" +./run.sh --python "${python}" --stage 4 --train_config conf/train_asr0.3.yaml +echo "==== ST (backend=pytorch ctc asr0.3) ===" +./run.sh --python "${python}" --stage 4 --train_config conf/train_ctc_asr0.3.yaml +echo "==== ST (backend=pytorch mt0.3) ===" +./run.sh --python "${python}" --stage 4 --train_config conf/train_mt0.3.yaml +echo "==== ST 
(backend=pytorch asr0.2 mt0.2) ===" +./run.sh --python "${python}" --stage 4 --train_config conf/train_asr0.2_mt0.2.yaml +echo "==== ST (backend=pytorch, model=transformer) ===" +./run.sh --python "${python}" --stage 4 --train_config conf/train_transformer.yaml +./run.sh --python "${python}" --stage 5 --train_config conf/train_transformer.yaml \ + --metric acc +./run.sh --python "${python}" --stage 5 --train_config conf/train_transformer.yaml \ + --metric bleu +./run.sh --python "${python}" --stage 5 --train_config conf/train_transformer.yaml \ + --metric loss +echo "==== ST (backend=pytorch asr0.3, model=transformer) ===" +./run.sh --python "${python}" --stage 4 --train_config conf/train_transformer_asr0.3.yaml +echo "==== ST (backend=pytorch ctc asr0.3, model=transformer) ===" +./run.sh --python "${python}" --stage 4 --train_config conf/train_transformer_ctc_asr0.3.yaml +echo "==== ST (backend=pytorch mt0.3, model=transformer) ===" +./run.sh --python "${python}" --stage 4 --train_config conf/train_transformer_mt0.3.yaml +echo "==== ST (backend=pytorch asr0.2 mt0.2, model=transformer) ===" +./run.sh --python "${python}" --stage 4 --train_config conf/train_transformer_asr0.2_mt0.2.yaml +echo "==== ST (backend=pytorch asr0.2 mt0.2, model=conformer) ===" +./run.sh --python "${python}" --stage 4 --train_config conf/train_conformer_asr0.2_mt0.2.yaml +# Remove generated files in order to reduce the disk usage +rm -rf exp tensorboard dump data +cd "${cwd}" || exit 1 + +# test mt recipe +cd ./egs/mini_an4/mt1 || exit 1 + +echo "==== MT (backend=pytorch) ===" +./run.sh --python "${python}" +echo "==== MT (backend=pytorch, model=transformer) ===" +./run.sh --python "${python}" --stage 4 --train_config conf/train_transformer.yaml +./run.sh --python "${python}" --stage 5 --train_config conf/train_transformer.yaml \ + --metric acc +./run.sh --python "${python}" --stage 5 --train_config conf/train_transformer.yaml \ + --metric bleu +./run.sh --python "${python}" --stage 5 --train_config conf/train_transformer.yaml \ + --metric loss +# Remove generated files in order to reduce the disk usage +rm -rf exp tensorboard dump data +cd "${cwd}" || exit 1 + +# test tts recipe +cd ./egs/mini_an4/tts1 || exit 1 + +echo "==== TTS (backend=pytorch) ===" +./run.sh --python "${python}" +# Remove generated files in order to reduce the disk usage +rm -rf exp tensorboard dump data +cd "${cwd}" || exit 1 + +echo "=== report ===" + +coverage combine egs/*/*/.coverage +coverage report +coverage xml diff --git a/ci/test_integration_espnet2.sh b/ci/test_integration_espnet2.sh new file mode 100755 index 00000000000..78086272af7 --- /dev/null +++ b/ci/test_integration_espnet2.sh @@ -0,0 +1,152 @@ +#!/usr/bin/env bash + +set -euo pipefail + +source tools/activate_python.sh +PYTHONPATH="${PYTHONPATH:-}:$(pwd)/tools/s3prl" +export PYTHONPATH +python="coverage run --append" +cwd=$(pwd) + +#### Make sure chainer-independent #### +python3 -m pip uninstall -y chainer + +# [ESPnet2] test asr recipe +cd ./egs2/mini_an4/asr1 +echo "==== [ESPnet2] ASR ===" +./run.sh --stage 1 --stop-stage 1 +feats_types="raw fbank_pitch" +token_types="bpe char" +for t in ${feats_types}; do + ./run.sh --stage 2 --stop-stage 4 --feats-type "${t}" --python "${python}" +done +for t in ${token_types}; do + ./run.sh --stage 5 --stop-stage 5 --token-type "${t}" --python "${python}" +done +for t in ${feats_types}; do + for t2 in ${token_types}; do + echo "==== feats_type=${t}, token_types=${t2} ===" + ./run.sh --ngpu 0 --stage 6 --stop-stage 13 --skip-upload 
false --feats-type "${t}" --token-type "${t2}" \ + --asr-args "--max_epoch=1" --lm-args "--max_epoch=1" --python "${python}" + done +done +echo "==== feats_type=raw, token_types=bpe, model_conf.extract_feats_in_collect_stats=False, normalize=utt_mvn ===" +./run.sh --ngpu 0 --stage 10 --stop-stage 13 --skip-upload false --feats-type "raw" --token-type "bpe" \ + --feats_normalize "utterance_mvn" --lm-args "--max_epoch=1" --python "${python}" \ + --asr-args "--model_conf extract_feats_in_collect_stats=false --max_epoch=1" + +echo "==== use_streaming, feats_type=raw, token_types=bpe, model_conf.extract_feats_in_collect_stats=False, normalize=utt_mvn ===" +./run.sh --use_streaming true --ngpu 0 --stage 6 --stop-stage 13 --skip-upload false --feats-type "raw" --token-type "bpe" \ + --feats_normalize "utterance_mvn" --lm-args "--max_epoch=1" --python "${python}" \ + --asr-args "--model_conf extract_feats_in_collect_stats=false --max_epoch=1 --encoder=contextual_block_transformer --decoder=transformer + --encoder_conf block_size=40 --encoder_conf hop_size=16 --encoder_conf look_ahead=16" + +if python3 -c "import k2" &> /dev/null; then + echo "==== use_k2, num_paths > nll_batch_size, feats_type=raw, token_types=bpe, model_conf.extract_feats_in_collect_stats=False, normalize=utt_mvn ===" + ./run.sh --num_paths 500 --nll_batch_size 20 --use_k2 true --ngpu 0 --stage 12 --stop-stage 13 --skip-upload false --feats-type "raw" --token-type "bpe" \ + --feats_normalize "utterance_mvn" --lm-args "--max_epoch=1" --python "${python}" \ + --asr-args "--model_conf extract_feats_in_collect_stats=false --max_epoch=1" + + echo "==== use_k2, num_paths == nll_batch_size, feats_type=raw, token_types=bpe, model_conf.extract_feats_in_collect_stats=False, normalize=utt_mvn ===" + ./run.sh --num_paths 20 --nll_batch_size 20 --use_k2 true --ngpu 0 --stage 12 --stop-stage 13 --skip-upload false --feats-type "raw" --token-type "bpe" \ + --feats_normalize "utterance_mvn" --lm-args "--max_epoch=1" --python "${python}" \ + --asr-args "--model_conf extract_feats_in_collect_stats=false --max_epoch=1" +fi + +# Remove generated files in order to reduce the disk usage +rm -rf exp dump data +cd "${cwd}" + +# [ESPnet2] test tts recipe +cd ./egs2/mini_an4/tts1 +echo "==== [ESPnet2] TTS ===" +./run.sh --ngpu 0 --stage 1 --stop-stage 8 --skip-upload false --train-args "--max_epoch 1" --python "${python}" +# Remove generated files in order to reduce the disk usage +rm -rf exp dump data + +# [ESPnet2] test gan-tts recipe +# NOTE(kan-bayashi): pytorch 1.4 - 1.6 works but 1.6 has a problem with CPU, +# so we test this recipe using only pytorch > 1.6 here. 
+# See also: https://github.com/pytorch/pytorch/issues/42446
+if python3 -c 'import torch as t; from distutils.version import LooseVersion as L; assert L(t.__version__) > L("1.6")' &> /dev/null; then
+    ./run.sh --fs 22050 --tts_task gan_tts --feats_extract linear_spectrogram --feats_normalize none --inference_model latest.pth \
+        --ngpu 0 --stop-stage 8 --skip-upload false --train-args "--num_iters_per_epoch 1 --max_epoch 1" --python "${python}"
+    rm -rf exp dump data
+fi
+cd "${cwd}"
+
+# [ESPnet2] test enh recipe
+if python -c 'import torch as t; from distutils.version import LooseVersion as L; assert L(t.__version__) >= L("1.2.0")' &> /dev/null; then
+    cd ./egs2/mini_an4/enh1
+    echo "==== [ESPnet2] ENH ==="
+    ./run.sh --stage 1 --stop-stage 1 --python "${python}"
+    feats_types="raw"
+    for t in ${feats_types}; do
+        echo "==== feats_type=${t} ==="
+        ./run.sh --ngpu 0 --stage 2 --stop-stage 10 --skip-upload false --feats-type "${t}" --spk-num 1 --enh-args "--max_epoch=1" --python "${python}"
+    done
+    # Remove generated files in order to reduce the disk usage
+    rm -rf exp dump data
+    cd "${cwd}"
+fi
+
+# [ESPnet2] test ssl1 recipe
+if python3 -c "import fairseq" &> /dev/null; then
+    cd ./egs2/mini_an4/ssl1
+    echo "==== [ESPnet2] SSL1/HUBERT ==="
+    ./run.sh --ngpu 0 --stage 1 --stop-stage 7 --feats-type "raw" --token_type "word" --skip-upload false --pt-args "--max_epoch=1" --pretrain_start_iter 0 --pretrain_stop_iter 1 --python "${python}"
+    # Remove generated files in order to reduce the disk usage
+    rm -rf exp dump data
+    cd "${cwd}"
+fi
+
+# [ESPnet2] Validate configuration files
+echo "" > dummy_token_list
+echo "==== [ESPnet2] Validation configuration files ==="
+if python3 -c 'import torch as t; from distutils.version import LooseVersion as L; assert L(t.__version__) >= L("1.8.0")' &> /dev/null; then
+    for f in egs2/*/asr1/conf/train_asr*.yaml; do
+        if [ "$f" == "egs2/fsc/asr1/conf/train_asr.yaml" ]; then
+            if ! python3 -c "import s3prl" > /dev/null; then
+                continue
+            fi
+        fi
+        ${python} -m espnet2.bin.asr_train --config "${f}" --iterator_type none --dry_run true --output_dir out --token_list dummy_token_list
+    done
+    for f in egs2/*/asr1/conf/train_lm*.yaml; do
+        ${python} -m espnet2.bin.lm_train --config "${f}" --iterator_type none --dry_run true --output_dir out --token_list dummy_token_list
+    done
+    for f in egs2/*/tts1/conf/train*.yaml; do
+        ${python} -m espnet2.bin.tts_train --config "${f}" --iterator_type none --normalize none --dry_run true --output_dir out --token_list dummy_token_list
+    done
+    for f in egs2/*/enh1/conf/train*.yaml; do
+        ${python} -m espnet2.bin.enh_train --config "${f}" --iterator_type none --dry_run true --output_dir out
+    done
+    for f in egs2/*/ssl1/conf/train*.yaml; do
+        ${python} -m espnet2.bin.hubert_train --config "${f}" --iterator_type none --normalize none --dry_run true --output_dir out --token_list dummy_token_list
+    done
+fi
+
+# These files must be identical to each other.
+for base in cmd.sh conf/slurm.conf conf/queue.conf conf/pbs.conf; do + file1= + for f in egs2/*/*/"${base}"; do + if [ -z "${file1}" ]; then + file1="${f}" + fi + diff "${file1}" "${f}" || { echo "Error: ${file1} and ${f} differ: To solve: for f in egs2/*/*/${base}; do cp egs2/TEMPLATE/asr1/${base} \${f}; done" ; exit 1; } + done +done + + +echo "==== [ESPnet2] test setup.sh ===" +for d in egs2/TEMPLATE/*; do + if [ -d "${d}" ]; then + d="${d##*/}" + egs2/TEMPLATE/"$d"/setup.sh egs2/test/"${d}" + fi +done +echo "=== report ===" + +coverage combine egs2/*/*/.coverage +coverage report +coverage xml diff --git a/ci/test_python.sh b/ci/test_python.sh index 2327e083373..b3f47146198 100755 --- a/ci/test_python.sh +++ b/ci/test_python.sh @@ -1,6 +1,7 @@ #!/usr/bin/env bash . tools/activate_python.sh +. tools/extra_path.sh set -euo pipefail @@ -17,4 +18,9 @@ fi # pycodestyle pycodestyle -r ${modules} --show-source --show-pep8 -LD_LIBRARY_PATH="${LD_LIBRARY_PATH:-}:$(pwd)/tools/chainer_ctc/ext/warp-ctc/build" pytest -q +LD_LIBRARY_PATH="${LD_LIBRARY_PATH:-}:$(pwd)/tools/chainer_ctc/ext/warp-ctc/build" \ + PYTHONPATH="${PYTHONPATH:-}:$(pwd)/tools/s3prl" pytest -q + +echo "=== report ===" +coverage report +coverage xml diff --git a/ci/test_utils.sh b/ci/test_utils.sh new file mode 100755 index 00000000000..11796606b0e --- /dev/null +++ b/ci/test_utils.sh @@ -0,0 +1,17 @@ +#!/usr/bin/env bash + +echo "=== run integration tests at test_utils ===" + +PATH=$(pwd)/bats-core/bin:$PATH +if ! [ -x "$(command -v bats)" ]; then + echo "=== install bats ===" + git clone https://github.com/bats-core/bats-core.git +fi +bats test_utils/integration_test_*.bats + +echo "=== report ===" + +source tools/activate_python.sh +coverage combine egs/*/*/.coverage +coverage report +coverage xml diff --git a/codecov.yml b/codecov.yml new file mode 100644 index 00000000000..c9e226f347d --- /dev/null +++ b/codecov.yml @@ -0,0 +1,10 @@ +# https://docs.codecov.com/docs/common-recipe-list +coverage: + status: + project: + default: + target: auto + # adjust accordingly based on how flaky your tests are + # this allows a 1% drop from the previous base commit coverage + threshold: 1% + informational: true diff --git a/doc/.gitignore b/doc/.gitignore index d4058a5aa91..79f7202744d 100644 --- a/doc/.gitignore +++ b/doc/.gitignore @@ -1,4 +1,4 @@ _gen/ _build/ build/ - +notebook/ \ No newline at end of file diff --git a/doc/README.md b/doc/README.md index 24f4cb6eeee..a316b2998c4 100644 --- a/doc/README.md +++ b/doc/README.md @@ -2,7 +2,7 @@ ## Install -We use [travis-sphinx](https://github.com/Syntaf/travis-sphinx) to generate & deploy HTML documentation. +We use [sphinx](https://www.sphinx-doc.org) to generate HTML documentation. ```sh $ cd @@ -46,8 +46,8 @@ $ cd $ ./ci/doc.sh ``` -open `doc/build/html/index.html` +open `doc/build/index.html` ## Deploy -When your PR is merged into `master` branch, our [Travis-CI](https://github.com/espnet/espnet/blob/master/.travis.yml) will automatically deploy your sphinx html into https://espnet.github.io/espnet/ by `travis-sphinx deploy`. +When your PR is merged into `master` branch, our [CI](https://github.com/espnet/espnet/blob/master/.github/workflows/doc.yml) will automatically deploy your sphinx html into https://espnet.github.io/espnet/. 
diff --git a/doc/argparse2rst.py b/doc/argparse2rst.py index 790049e0bc9..684673d90a3 100755 --- a/doc/argparse2rst.py +++ b/doc/argparse2rst.py @@ -20,11 +20,16 @@ def __init__(self, path): def get_parser(): parser = configargparse.ArgumentParser( - description='generate RST from argparse options', + description="generate RST from argparse options", config_file_parser_class=configargparse.YAMLConfigFileParser, - formatter_class=configargparse.ArgumentDefaultsHelpFormatter) - parser.add_argument('src', type=str, nargs='+', - help='source python files that contain get_parser() func') + formatter_class=configargparse.ArgumentDefaultsHelpFormatter, + ) + parser.add_argument( + "src", + type=str, + nargs="+", + help="source python files that contain get_parser() func", + ) return parser @@ -53,7 +58,8 @@ def get_parser(): for m in modinfo: cmd = m.path.name sep = "~" * len(cmd) - print(f""" + print( + f""" .. _{cmd}: @@ -65,4 +71,5 @@ def get_parser(): :func: get_parser :prog: {cmd} -""") +""" + ) diff --git a/doc/conf.py b/doc/conf.py index 32997b08b86..c2f5acd1881 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -20,8 +20,8 @@ import os import sys -sys.path.insert(0, os.path.abspath('../espnet/nets')) -sys.path.insert(0, os.path.abspath('../utils')) +sys.path.insert(0, os.path.abspath("../espnet/nets")) +sys.path.insert(0, os.path.abspath("../utils")) # -- General configuration ------------------------------------------------ @@ -35,8 +35,8 @@ extensions = [ "nbsphinx", "sphinx.ext.autodoc", - 'sphinx.ext.napoleon', - 'sphinx.ext.viewcode', + "sphinx.ext.napoleon", + "sphinx.ext.viewcode", "sphinx.ext.mathjax", "sphinx.ext.todo", "sphinxarg.ext", @@ -44,42 +44,46 @@ ] # Add any paths that contain templates here, relative to this directory. -templates_path = ['_templates'] +templates_path = ["_templates"] # The suffix(es) of source filenames. # You can specify multiple suffix as a list of string: # # source_suffix = '.rst' -source_suffix = ['.rst', '.md'] +source_suffix = [".rst", ".md"] # enable to markdown from recommonmark.parser import CommonMarkParser source_parsers = { - '.md': CommonMarkParser, + ".md": CommonMarkParser, } # AutoStructify setting ref: https://qiita.com/pashango2/items/d1b379b699af85b529ce from recommonmark.transform import AutoStructify -github_doc_root = 'https://github.com/rtfd/recommonmark/tree/master/doc/' +github_doc_root = "https://github.com/rtfd/recommonmark/tree/master/doc/" def setup(app): - app.add_config_value('recommonmark_config', { - 'url_resolver': lambda url: github_doc_root + url, - 'auto_toc_tree_section': 'Contents', - }, True) + app.add_config_value( + "recommonmark_config", + { + "url_resolver": lambda url: github_doc_root + url, + "auto_toc_tree_section": "Contents", + }, + True, + ) app.add_transform(AutoStructify) # The master toctree document. -master_doc = 'index' +master_doc = "index" # General information about the project. -project = u'ESPnet' -copyright = u'2017, Shinji Watanabe' -author = u'Shinji Watanabe' +project = u"ESPnet" +copyright = u"2017, Shinji Watanabe" +author = u"Shinji Watanabe" # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the @@ -87,6 +91,7 @@ def setup(app): # # The short X.Y version. import espnet + version = espnet.__version__ # The full version, including alpha/beta/rc tags. release = espnet.__version__ @@ -102,18 +107,21 @@ def setup(app): # directories to ignore when looking for source files. 
# This patterns also effect to html_static_path and html_extra_path
 exclude_patterns = [
-    '_build', 'Thumbs.db', '.DS_Store', "README.md",
-    # NOTE: becuase these genearate files are directly included
+    "_build",
+    "Thumbs.db",
+    ".DS_Store",
+    "README.md",
+    # NOTE: because these generated files are directly included
     # from the other files, we should exclude these files manually.
     "_gen/modules.rst",
     "_gen/utils_sh.rst",
     "_gen/utils_py.rst",
     "_gen/espnet_bin.rst",
-    "_gen/espnet-bin.rst"
+    "_gen/espnet-bin.rst",
 ]

 # The name of the Pygments (syntax highlighting) style to use.
-pygments_style = 'sphinx'
+pygments_style = "sphinx"

 # If true, `todo` and `todoList` produce output, else they produce nothing.
 todo_include_todos = False
@@ -127,7 +135,7 @@ def setup(app):
 # html_theme = 'nature'
 import sphinx_rtd_theme

-html_theme = 'sphinx_rtd_theme'
+html_theme = "sphinx_rtd_theme"
 html_theme_path = [sphinx_rtd_theme.get_html_theme_path()]

 # Theme options are theme-specific and customize the look and feel of a theme
@@ -147,16 +155,16 @@ def setup(app):
 # This is required for the alabaster theme
 # refs: http://alabaster.readthedocs.io/en/latest/installation.html#sidebars
 html_sidebars = {
-    '**': [
-        'relations.html',  # needs 'show_related': True theme option to display
-        'searchbox.html',
+    "**": [
+        "relations.html",  # needs 'show_related': True theme option to display
+        "searchbox.html",
     ]
 }

 # -- Options for HTMLHelp output ------------------------------------------

 # Output file base name for HTML help builder.
-htmlhelp_basename = 'ESPnetdoc'
+htmlhelp_basename = "ESPnetdoc"

 # -- Options for LaTeX output ---------------------------------------------

@@ -164,15 +172,12 @@
     # The paper size ('letterpaper' or 'a4paper').
     #
     # 'papersize': 'letterpaper',
-
     # The font size ('10pt', '11pt' or '12pt').
     #
     # 'pointsize': '10pt',
-
     # Additional stuff for the LaTeX preamble.
     #
     # 'preamble': '',
-
     # Latex figure (float) alignment
     #
     # 'figure_align': 'htbp',
@@ -182,18 +187,14 @@
 # (source start file, target name, title,
 #  author, documentclass [howto, manual, or own class]).
 latex_documents = [
-    (master_doc, 'ESPnet.tex', u'ESPnet Documentation',
-     u'Shinji Watanabe', 'manual'),
+    (master_doc, "ESPnet.tex", u"ESPnet Documentation", u"Shinji Watanabe", "manual"),
 ]

 # -- Options for manual page output ---------------------------------------

 # One entry per manual page. List of tuples
 # (source start file, name, description, authors, manual section).
-man_pages = [
-    (master_doc, 'espnet', u'ESPnet Documentation',
-     [author], 1)
-]
+man_pages = [(master_doc, "espnet", u"ESPnet Documentation", [author], 1)]

 # -- Options for Texinfo output -------------------------------------------

@@ -201,12 +202,18 @@
 # (source start file, target name, title, author,
 #  dir menu entry, description, category)
 texinfo_documents = [
-    (master_doc, 'ESPnet', u'ESPnet Documentation',
-     author, 'ESPnet', 'One line description of project.',
-     'Miscellaneous'),
+    (
+        master_doc,
+        "ESPnet",
+        u"ESPnet Documentation",
+        author,
+        "ESPnet",
+        "One line description of project.",
+        "Miscellaneous",
+    ),
 ]

-autoclass_content = 'both'
+autoclass_content = "both"

 # NOTE(kan-bayashi): Do not update outputs in notebook automatically.
-nbsphinx_execute = 'never' +nbsphinx_execute = "never" diff --git a/doc/docker.md b/doc/docker.md index a733c1c3594..b9360600626 100644 --- a/doc/docker.md +++ b/doc/docker.md @@ -10,7 +10,7 @@ $ ./run.sh --docker-gpu 0 --docker-egs chime4/asr1 --docker-folders /export/corp Optionally, you can set the CUDA version with the arguments `--docker-cuda` respectively (default version set at CUDA=9.1). The docker container can be built based on the CUDA installed in your computer if you empty this arguments. By default, all GPU-based images are built with NCCL v2 and CUDNN v7. The arguments required for the docker configuration have a prefix "--docker" (e.g., `--docker-gpu`, `--docker-egs`, `--docker-folders`). `run.sh` accept all normal ESPnet arguments, which must be followed by these docker arguments. -All docker containers are executed using the same user as your login account. If you want to run the docker in root access, add the flag `--is-root` to command line. In addition, you can pass any enviroment variable using `--docker-env` (e.g., `--docker-env "foo=path"`) +All docker containers are executed using the same user as your login account. If you want to run the docker with root access, add the flag `--is-root` to the command line. In addition, you can pass any environment variable using `--docker-env` (e.g., `--docker-env "foo=path"`) ### ESPnet 2 Recipes diff --git a/doc/espnet2_task.md b/doc/espnet2_task.md index 58a508c1393..af3f3a5d866 100644 --- a/doc/espnet2_task.md +++ b/doc/espnet2_task.md @@ -55,7 +55,7 @@ if __name__ == "__main__": ## Data input system Espnet2 also provides a command line interface to describe the training corpus. -On the contrary, unlike `fairseq` or training system such as `pytorch-lightining`, +On the contrary, unlike `fairseq` or training systems such as `pytorch-lightning`, our `Task` class doesn't have an interface for building the dataset explicitly. This is because we aim at the task related to speech/text only, so we don't need such general system so far. diff --git a/doc/espnet2_training_option.md b/doc/espnet2_training_option.md index 173af4a39a1..56c5db07830 100644 --- a/doc/espnet2_training_option.md +++ b/doc/espnet2_training_option.md @@ -326,7 +326,7 @@ and the shape information is required only when `--batch_type numel`. ### `--batch_type folded` -**In ESPnet1, this mode is refered as seq.** +**In ESPnet1, this mode is referred to as seq.** This mode creates mini-batch which has the size of `base_batch_size // max_i(1 + L_i // f_i)`. diff --git a/doc/espnet2_tutorial.md b/doc/espnet2_tutorial.md index 3f157992cff..0dd69624a4a 100644 --- a/doc/espnet2_tutorial.md +++ b/doc/espnet2_tutorial.md @@ -18,6 +18,7 @@ We are planning a super major update, called `ESPnet2`. The developing status is - You don't need to create the feature file before training, but just input wave data directly. - We support both raw wave input and extracted features. - The preprocessing for text, tokenization to characters, or sentencepieces, can be also applied during training. + - Support **self-supervised learning representations** from s3prl - Discarding the JSON format describing the training corpus. - Why do we discard the JSON format? Because a dict object generated from a large JSON file requires much memory and it also takes much time to parse such a large JSON file.
- Support distributed data-parallel training (Not enough tested) @@ -179,7 +180,7 @@ You need to do one of the following two ways to change the training configuratio ```sh # Give a configuration file -./run.sh --asr_train_config conf/train_asr.yaml +./run.sh --asr_config conf/train_asr.yaml # Give arguments to "espnet2/bin/asr_train.py" directly ./run.sh --asr_args "--foo arg --bar arg2" ``` @@ -222,8 +223,7 @@ Note that you need to setup your environment correctly to use distributed traini - [Distributed training](./espnet2_distributed.md) - [Using Job scheduling system](./parallelization.md) - -## Use specified expereiment directory for evaluation +## Use specified experiment directory for evaluation If you already have trained a model, you may wonder how to give it to run.sh when you'll evaluate it later. By default the directory name is determined according to given options, `asr_args`, `lm_args`, or etc. @@ -244,4 +244,99 @@ You can overwrite it by `--asr_exp` and `--lm_exp`. ./run.sh --download_model --skip_train true ``` -You need to fill `model_name` by yourself. See the following link about our pretrain models: https://github.com/espnet/espnet_model_zoo +You need to fill `model_name` by yourself. You can search for pretrained models on Hugging Face using the tag [espnet](https://huggingface.co/models?library=espnet). + +(Deprecated: See the following link about our pretrained models: https://github.com/espnet/espnet_model_zoo) + +## Packing and sharing your trained model + +ESPnet encourages you to share your results using platforms like [Hugging Face](https://huggingface.co/) or [Zenodo](https://zenodo.org/) (the latter will eventually be deprecated). + +For sharing your models, the last three stages of each task simplify this process. The model is packed into a zip file and uploaded to the selected platform (one or both). + +For **Hugging Face**, you need to first create a repository (`<repo_id> = <username>/<repo_name>`). +Remember to install `git-lfs` before continuing. +Then, execute `run.sh` as follows: + +```sh +# For ASR recipe +./run.sh --stage 14 --skip-upload-hf false --hf-repo <repo_id> + +# For TTS recipe +./run.sh --stage 8 --skip-upload-hf false --hf-repo <repo_id> +``` + +For **Zenodo**, you need to register your account first. Then, execute `run.sh` as follows: + +```sh +# For ASR recipe +./run.sh --stage 14 --skip-upload false + +# For TTS recipe +./run.sh --stage 8 --skip-upload false +``` + +The packed model can be uploaded to both platforms by setting the previously mentioned flags. + +## Usage of Self-Supervised Learning Representations as features + +ESPnet supports self-supervised learning representations (SSLR) to replace traditional spectrum features. In some cases, SSLRs can boost performance. + +To use SSLRs in your task, you need to make several modifications. + +### Prerequisite +1. Install [S3PRL](https://github.com/s3prl/s3prl) by `tools/installers/install_s3prl.sh`. +2. If HuBERT / Wav2Vec is needed, [fairseq](https://github.com/pytorch/fairseq) should be installed by `tools/installers/install_fairseq.sh`. + +### Usage +1. To reduce the time used in the `collect_stats` step, please specify `--feats_normalize uttmvn` in `run.sh` and pass it as arguments to `asr.sh` or other task-specific scripts. (Recommended) +2. In the configuration file, specify the `frontend` and `preencoder`. Taking `HuBERT` as an example: The `upstream` name can be any upstream model supported in S3PRL. `multilayer_feature: True` means the final representation is a weighted sum of all layers' hidden states from the SSLR model.
+ ``` + frontend: s3prl + frontend_conf: + frontend_conf: + upstream: hubert_large_ll60k # Note: If the upstream is changed, please change the input_size in the preencoder. + download_dir: ./hub + multilayer_feature: True + ``` + Here, the `preencoder` is used to reduce the input dimension to the encoder, which reduces the memory cost. The `input_size` depends on the upstream model, while the `output_size` can be set to any value. + ``` + preencoder: linear + preencoder_conf: + input_size: 1024 # Note: If the upstream is changed, please change this value accordingly. + output_size: 80 + ``` +3. Because different `upstream` models have different shift sizes (e.g., `HuBERT` and `Wav2Vec2.0` have a `20ms` frameshift), the downsampling rate (`input_layer`) in the `encoder` configuration sometimes needs to be changed. For example, using `input_layer: conv2d2` results in a total frameshift of `40ms`, which is enough for some tasks. + +## Streaming ASR +ESPnet supports streaming Transformer/Conformer ASR with blockwise synchronous beam search. + +For more details, please refer to the [paper](https://arxiv.org/pdf/2006.14941.pdf). + +### Training + +To achieve streaming ASR, please employ a blockwise Transformer/Conformer encoder in the configuration file. Taking `blockwise Transformer` as an example: The `encoder` name can be `contextual_block_transformer` or `contextual_block_conformer`. + +```yaml +encoder: contextual_block_transformer +encoder_conf: + block_size: 40 # block size for block processing + hop_size: 16 # hop size for block processing + look_ahead: 16 # look-ahead size for block processing + init_average: true # whether to use average input as initial context + ctx_pos_enc: true # whether to use positional encoding for the context vectors +``` + +### Decoding + +To enable online decoding, the argument `--use_streaming true` should be added to `run.sh`. + +```sh +./run.sh --stage 12 --use_streaming true +``` + +### FAQ +1. Issue about `'NoneType' object has no attribute 'max'` during training: Please make sure you employ the `forward_train` function during training; check more details [here](https://github.com/espnet/espnet/issues/3803). +2. I successfully trained the model, but encountered the above issue during decoding: You may forget to specify `--use_streaming true` to select streaming inference. diff --git a/doc/index.rst b/doc/index.rst index 13f20ab0a96..30cd3d35fd4 100644 --- a/doc/index.rst +++ b/doc/index.rst @@ -28,16 +28,7 @@ ESPnet is an end-to-end speech processing toolkit, mainly focuses on end-to-end ./espnet2_task.md ./espnet2_distributed.md -.. toctree:: - :maxdepth: 1 - :caption: Notebook: - - ./notebook/asr_cli.ipynb - ./notebook/asr_library.ipynb - ./notebook/tts_cli.ipynb - ./notebook/pretrained.ipynb - ./notebook/tts_realtime_demo.ipynb - ./notebook/st_demo.ipynb +.. include:: ./_gen/notebooks.rst .. include:: ./_gen/modules.rst diff --git a/doc/installation.md b/doc/installation.md index e29ebf2e259..db45a09135b 100644 --- a/doc/installation.md +++ b/doc/installation.md @@ -32,14 +32,14 @@ the following packages are installed using Anaconda, so you can skip them.)
# For CentOS $ sudo yum install libsndfile ``` -- ffmpeg (This is not required when installataion, but used in some recipes) +- ffmpeg (This is not required when installing, but used in some recipes) ```sh # For Ubuntu $ sudo apt-get install ffmpeg # For CentOS $ sudo yum install ffmpeg ``` -- flac (This is not required when installataion, but used in some recipes) +- flac (This is not required when installing, but used in some recipes) ```sh # For Ubuntu $ sudo apt-get install flac @@ -202,14 +202,14 @@ We also have [prebuilt Kaldi binaries](https://github.com/espnet/espnet/blob/mas ```sh $ cd /tools - $ make TH_VERSION=1.3.1 + $ make TH_VERSION=1.10.1 ``` Note that the CUDA version is derived from `nvcc` command. If you'd like to specify the other CUDA version, you need to give `CUDA_VERSION`. ```sh $ cd /tools - $ make TH_VERSION=1.3.1 CUDA_VERSION=10.1 + $ make TH_VERSION=1.10.1 CUDA_VERSION=11.3 ``` If you don't have `nvcc` command, packages are installed for CPU mode by default. @@ -255,7 +255,7 @@ e.g. ``` ### Check installation -You can check whether your installation is succesfully finished by +You can check whether your installation is successfully finished by ```sh cd /tools . ./activate_python.sh; python3 check_install.py diff --git a/doc/module2rst.py b/doc/module2rst.py index a4cd4db3f6c..7cb83b9e7ad 100755 --- a/doc/module2rst.py +++ b/doc/module2rst.py @@ -8,15 +8,15 @@ # parser parser = configargparse.ArgumentParser( - description='generate RST files from module recursively into /_gen', + description="generate RST files from module recursively into /_gen", config_file_parser_class=configargparse.YAMLConfigFileParser, - formatter_class=configargparse.ArgumentDefaultsHelpFormatter) -parser.add_argument('--root', nargs='+', - help='root module to generate docs recursively') -parser.add_argument('--dst', type=str, - help='destination path to generate RSTs') -parser.add_argument('--exclude', nargs='*', default=[], - help='exclude module name') + formatter_class=configargparse.ArgumentDefaultsHelpFormatter, +) +parser.add_argument( + "--root", nargs="+", help="root module to generate docs recursively" +) +parser.add_argument("--dst", type=str, help="destination path to generate RSTs") +parser.add_argument("--exclude", nargs="*", default=[], help="exclude module name") args = parser.parse_args() print(args) @@ -36,12 +36,14 @@ def gen_rst(module_path, f): doc = module.__doc__ if doc is None: doc = "" - f.write(f""" + f.write( + f""" {title} {sep} {doc} -""") +""" + ) for cpath in glob(module_path + "/**/*.py", recursive=True): print(cpath) @@ -51,7 +53,8 @@ def gen_rst(module_path, f): continue cname = to_module(cpath) csep = "-" * len(cname) - f.write(f""" + f.write( + f""" .. _{cname}: {cname} @@ -62,7 +65,8 @@ def gen_rst(module_path, f): :undoc-members: :show-inheritance: -""") +""" + ) f.flush() diff --git a/doc/notebook b/doc/notebook deleted file mode 160000 index ef3cbf880fc..00000000000 --- a/doc/notebook +++ /dev/null @@ -1 +0,0 @@ -Subproject commit ef3cbf880fcd725d11021e541a0cdfae4080446d diff --git a/doc/notebook2rst.sh b/doc/notebook2rst.sh new file mode 100755 index 00000000000..83bf7d57794 --- /dev/null +++ b/doc/notebook2rst.sh @@ -0,0 +1,17 @@ +#!/usr/bin/env bash + +set -euo pipefail + +cd "$(dirname "$0")" + +if [ ! -d notebook ]; then + git clone https://github.com/espnet/notebook --depth 1 +fi + +echo "\ +.. 
toctree:: + :maxdepth: 1 + :caption: Notebook: +" + +find ./notebook/*.ipynb -exec echo " {}" \; diff --git a/doc/tutorial.md b/doc/tutorial.md index dd80f408b73..8428129fcdc 100644 --- a/doc/tutorial.md +++ b/doc/tutorial.md @@ -142,7 +142,7 @@ echo 2 `run.sh` has multiple stages including data prepration, traning, and etc., so you may likely want to start from the specified stage if some stages are failed by some reason for example. -You can start from specified stage as following and stop the process at the specifed stage: +You can start from the specified stage as follows and stop the process at the specified stage: ```bash # Start from 3rd stage and stop at 5th stage $ ./run.sh --stage 3 --stop-stage 5 @@ -152,96 +152,147 @@ $ ./run.sh --stage 3 --stop-stage 5 ### CTC, attention, and hybrid CTC/attention -ESPnet can completely switch the mode from CTC, attention, and hybrid CTC/attention +ESPnet can easily switch the model's training/decoding mode among CTC, attention, and hybrid CTC/attention. + +Each mode can be trained by specifying `mtlalpha` in the [training configuration](https://github.com/espnet/espnet/blob/7dc9da2f07c54b4b0e878d8ef219fcd4d16a5bec/doc/tutorial.md#changing-the-training-configuration): ```sh # hybrid CTC/attention (default) -# --mtlalpha 0.5 and --ctc_weight 0.3 in most cases -$ ./run.sh +mtlalpha: 0.3 + +# CTC +mtlalpha: 1.0 + +# attention +mtlalpha: 0.0 +``` -# CTC mode -$ ./run.sh --mtlalpha 1.0 --ctc_weight 1.0 --recog_model model.loss.best +Decoding for each mode can be done using the following decoding configurations: -# attention mode -$ ./run.sh --mtlalpha 0.0 --ctc_weight 0.0 --maxlenratio 0.8 --minlenratio 0.3 +```sh +# hybrid CTC/attention (default) +ctc-weight: 0.3 +beam-size: 10 + +# CTC +ctc-weight: 1.0 +## for best path decoding +api: v1 # default setting (can be omitted) +## for prefix search decoding w/ beam search +api: v2 +beam-size: 10 + +# attention +ctc-weight: 0.0 +beam-size: 10 +maxlenratio: 0.8 +minlenratio: 0.3 ``` -- The CTC training mode does not output the validation accuracy, and the optimum model is selected with its loss value -(i.e., `--recog_model model.loss.best`). -- The pure attention mode requires to set the maximum and minimum hypothesis length (`--maxlenratio` and `--minlenratio`), appropriately. In general, if you have more insertion errors, you can decrease the `maxlenratio` value, while if you have more deletion errors you can increase the `minlenratio` value. Note that the optimum values depend on the ratio of the input frame and output label lengths, which is changed for each language and each BPE unit. +- The CTC mode does not compute the validation accuracy, and the optimum model is selected with its loss value (i.e., `$ ./run.sh --recog_model model.loss.best`). +- The CTC decoding adopts best path decoding by default, which simply outputs the most probable label at every time step. The prefix search decoding with beam search is also supported in [beam search API v2](https://espnet.github.io/espnet/apis/espnet_bin.html?highlight=api#asr-recog-py). +- The pure attention mode requires setting the maximum and minimum hypothesis lengths (`--maxlenratio` and `--minlenratio`) appropriately. In general, if you have more insertion errors, you can decrease the `maxlenratio` value, while if you have more deletion errors you can increase the `minlenratio` value. Note that the optimum values depend on the ratio of the input frame and output label lengths, which change for each language and each BPE unit.
+- A negative `maxlenratio` can be used to set a constant maximum hypothesis length independent of the number of input frames. If `maxlenratio` is set to `-1`, the decoding will always stop after the first output, which can be used to emulate utterance classification tasks. This is suitable for some spoken language understanding and speaker identification tasks. - About the effectiveness of hybrid CTC/attention during training and recognition, see [2] and [3]. For example, hybrid CTC/attention is not sensitive to the above maximum and minimum hypothesis heuristics. ### Transducer -ESPnet also supports transducer-based models. -To switch to transducer mode, the following should be set in the training config: +***Important: If you encounter any issue related to Transducer loss, please open an issue in [our fork of warp-transducer](https://github.com/b-flo/warp-transducer).*** + +ESPnet supports models trained with Transducer loss, aka Transducer models. To train such a model, the following should be set in the training config: ``` criterion: loss model-module: "espnet.nets.pytorch_backend.e2e_asr_transducer:E2E" ``` -Several transducer architectures are currently available: -- RNN-Transducer (default) -- Custom-Transducer (`etype: custom` and `dtype: custom`) +#### Architecture + +Several Transducer architectures are currently available in ESPnet: +- RNN-Transducer (default, e.g.: `etype: blstm` with `dtype: lstm`) +- Custom-Transducer (e.g.: `etype: custom` and `dtype: custom`) - Mixed Custom/RNN-Transducer (e.g: `etype: custom` with `dtype: lstm`) -The architecture specification is separated for the encoder and decoder parts, and defined by the user through, respectively, `etype` and `dtype` in training config. If `custom` is specified for either, a customizable architecture will be used for the corresponding part, otherwise a RNN-based architecture will be selected. +The architecture specification is separated for the encoder and decoder parts, and defined by the user through, respectively, `etype` and `dtype` in the training config. If `custom` is specified for either, a customizable architecture will be used for the corresponding part. Otherwise, an RNN-based architecture will be selected. + +Here, the *custom* architecture is a unique feature of the Transducer model in ESPnet. It was made available to add some flexibility in the architecture definition and ease the reproduction of some SOTA Transducer models mixing different layer types or parameters within the same model part (encoder or decoder).
As such, the architecture definition is different from the RNN architecture: -While defining a RNN architecture is done in an usual manner (similarly to CTC, Att and MTL) with global parameters, a customizable architecture definition for transducer is different: -1) Each blocks (or layers) for both network part should be specified individually through `enc-block-arch` or/and `dec-block-arch`: +1) Each block (or layer) of the custom architecture should be specified individually through `enc-block-arch` and/or `dec-block-arch` parameters: - # e.g: TDNN-Transformer encoder + # e.g.: Conv-Transformer encoder etype: custom enc-block-arch: - - type: tdnn - idim: 512 - odim: 320 - ctx_size: 3 - dilation: 1 + - type: conv1d + idim: 80 + odim: 32 + kernel_size: [3, 7] + stride: [1, 2] + - type: conv1d + idim: 32 + odim: 32 + kernel_size: 3 + stride: 2 + - type: conv1d + idim: 32 + odim: 384 + kernel_size: 3 stride: 1 - type: transformer - d_hidden: 320 - d_ff: 320 + d_hidden: 384 + d_ff: 1536 heads: 4 -2) Each part has different allowed block type: `tdnn`, `conformer` or `transformer` for encoder and `causal-conv1d` or `transformer` for decoder. For each block type, a set of parameters are needed: - - # TDNN - - type: tdnn - idim: input dimension - odim: output dimension - ctx_size: size of the context window - dilation: parameter to control the stride of elements within the neighborhood - stride: stride of the sliding blocks - [optional: dropout-rate] +2) Different block types are allowed for the custom encoder (`conv1d`, `conformer` or `transformer`) and the custom decoder (`causal-conv1d` or `transformer`). Each one has a set of mandatory and optional parameters: + + # 1D convolution (TDNN) block + - type: conv1d + idim: [Input dimension. (int)] + odim: [Output dimension. (int)] + kernel_size: [Size of the context window. (int or tuple)] + stride (optional): [Stride of the sliding blocks. (int or tuple, default = 1)] + dilation (optional): [Parameter to control the stride of elements within the neighborhood. (int or tuple, default = 1)] + groups (optional): [Number of blocked connections from input channels to output channels. (int, default = 1)] + bias (optional): [Whether to add a learnable bias to the output. (bool, default = True)] + use-relu (optional): [Whether to use a ReLU activation after convolution. (bool, default = True)] + use-batchnorm (optional): [Whether to use batch normalization after convolution. (bool, default = False)] + dropout-rate (optional): [Dropout-rate for TDNN block. (float, default = 0.0)] # Transformer - type: transformer - d_hidden: input/output dimension - d_ff: feed-forward hidden dimension - heads: number of heads in multi-head attention - [optional: dropout-rate, pos-dropout-rate, att-dropout-rate] + d_hidden: [Input/output dimension of Transformer block. (int)] + d_ff: [Hidden dimension of the Feed-forward module. (int)] + heads: [Number of heads in multi-head attention. (int)] + dropout-rate (optional): [Dropout-rate for Transformer block. (float, default = 0.0)] + pos-dropout-rate (optional): [Dropout-rate for positional encoding module. (float, default = 0.0)] + att-dropout-rate (optional): [Dropout-rate for attention module.
(float, default = 0.0)] # Conformer - type: conformer - d_hidden: input/output dimension - d_ff: feed-forward hidden dimension - heads: number of heads in multi-head attention - macaron_style: wheter to use macaron style - use_conv_mod: whether to use convolutional module - conv_mod_kernel: number of kernel in convolutional module (optional if `use_conv_mod=True`) - [optional: dropout-rate, pos-dropout-rate, att-dropout-rate] + d_hidden: [Input/output dimension of Conformer block. (int)] + d_ff: [Hidden dimension of the Feed-forward module. (int)] + heads: [Number of heads in multi-head attention. (int)] + macaron_style: [Whether to use macaron style. (bool)] + use_conv_mod: [Whether to use convolutional module. (bool)] + conv_mod_kernel (required if use_conv_mod = True): [Kernel size of the convolutional module. (int)] + dropout-rate (optional): [Dropout-rate for Conformer block. (float, default = 0.0)] + pos-dropout-rate (optional): [Dropout-rate for positional encoding module. (float, default = 0.0)] + att-dropout-rate (optional): [Dropout-rate for attention module. (float, default = 0.0)] # Causal Conv1d - type: causal-conv1d - idim: input dimension - odim: output dimension - kernel_size: size of convolving kernel - stride: stride of the convolution - dilation: spacing between the kernel points - -3) Each specified block(s) for each network part can be repeated by specifying the number of duplications through `enc-block-repeat` or `dec-block-repeat` parameters: + idim: [Input dimension. (int)] + odim: [Output dimension. (int)] + kernel_size: [Size of the context window. (int)] + stride (optional): [Stride of the sliding blocks. (int, default = 1)] + dilation (optional): [Parameter to control the stride of elements within the neighborhood. (int, default = 1)] + groups (optional): [Number of blocked connections from input channels to output channels. (int, default = 1)] + bias (optional): [Whether to add a learnable bias to the output. (bool, default = True)] + use-relu (optional): [Whether to use a ReLU activation after convolution. (bool, default = True)] + use-batchnorm (optional): [Whether to use batch normalization after convolution. (bool, default = False)] + dropout-rate (optional): [Dropout-rate for Causal Conv1d block. (float, default = 0.0)] + +3) The defined architecture can be repeated by specifying the total number of blocks/layers in the architecture through `enc-block-repeat` and/or `dec-block-repeat` parameters: # e.g.: 2x (Causal-Conv1d + Transformer) decoder dtype: transformer @@ -258,47 +309,88 @@ While defining a RNN architecture is done in an usual manner (similarly to CTC, att-dropout-rate: 0.4 dec-block-repeat: 2 -For more information about the customizable architecture, please refer to [vivos config examples](https://github.com/espnet/espnet/tree/master/egs/vivos/asr1/conf/tuning/transducer) which cover all cases. +#### Multi-task learning + +We also support multi-task learning with various auxiliary losses, such as CTC, cross-entropy w/ label-smoothing (LM loss), auxiliary Transducer, and symmetric KL divergence.
+The four losses can be simultaneously trained with the main Transducer loss to jointly optimize the total loss defined as: + +`\mathcal{L}_{tot} = \lambda_{1}\mathcal{L}_{1} + \lambda_{2}\mathcal{L}_{2} + \lambda_{3}\mathcal{L}_{3} + \lambda_{4}\mathcal{L}_{4} + \lambda_{5}\mathcal{L}_{5}` + +where the losses are, in order: the main Transducer loss, the CTC loss, the auxiliary Transducer loss, the symmetric KL divergence loss, and the LM loss. The lambda values define their respective contributions to the overall loss. Additionally, each loss can be independently selected or omitted depending on the task. + +Each loss can be defined in the training config alongside its specific options, as follows: + + # Transducer loss (L1) + transducer-loss-weight: [Weight of the main Transducer loss. (float)] + + # CTC loss (L2) + use-ctc-loss: True + ctc-loss-weight (optional): [Weight of the CTC loss. (float, default = 0.5)] + ctc-loss-dropout-rate (optional): [Dropout rate for encoder output representation. (float, default = 0.0)] + + # Auxiliary Transducer loss (L3) + use-aux-transducer-loss: True + aux-transducer-loss-weight (optional): [Weight of the auxiliary Transducer loss. (float, default = 0.4)] + aux-transducer-loss-enc-output-layers (required if use-aux-transducer-loss = True): [List of intermediate encoder layer IDs to compute auxiliary Transducer loss(es). (list)] + aux-transducer-loss-mlp-dim (optional): [Hidden dimension for the MLP network. (int, default = 320)] + aux-transducer-loss-mlp-dropout-rate (optional): [Dropout rate for the MLP network. (float, default = 0.0)] + + # Symmetric KL divergence loss (L4) + # Note: It can be only used in addition to the auxiliary Transducer loss. + use-symm-kl-div-loss: True + symm-kl-div-loss-weight (optional): [Weight of the symmetric KL divergence loss. (float, default = 0.2)] + + # LM loss (L5) + use-lm-loss: True + lm-loss-weight (optional): [Weight of the LM loss. (float, default = 0.2)] + lm-loss-smoothing-rate (optional): [Smoothing rate for LM loss. If > 0, label smoothing is enabled. (float, default = 0.0)] + +#### Inference + +Various decoding algorithms are also available for Transducer by setting the `beam-size` and `search-type` parameters in the decode config. -Various decoding algorithms are also available for transducer by setting `search-type` parameter in decode config: -- Default beam search (`default`) -- Time-synchronous decoding (`tsd`) -- Alignment-length decoding (`alsd`) -- N-step Constrained beam search (`nsc`) + - Greedy search constrained to one emission per timestep (`beam-size: 1`). + - Beam search algorithm without prefix search (`beam-size: >1` and `search-type: default`). + - Time Synchronous Decoding [[Saon et al., 2020]](https://ieeexplore.ieee.org/abstract/document/9053040) (`beam-size: >1` and `search-type: tsd`). + - Alignment-Length Synchronous Decoding [[Saon et al., 2020]](https://ieeexplore.ieee.org/abstract/document/9053040) (`beam-size: >1` and `search-type: alsd`). + - N-step Constrained beam search modified from [[Kim et al., 2020]](https://arxiv.org/abs/2002.03577) (`beam-size: >1` and `search-type: nsc`). + - modified Adaptive Expansion Search, based on [[Kim et al., 2021]](https://ieeexplore.ieee.org/abstract/document/9250505) and NSC (`beam-size: >1` and `search-type: maes`).
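As an illustration, a decode config selecting mAES could combine these options as follows (a hypothetical sketch with placeholder values rather than tuned recommendations; the individual parameters are detailed right below):

    # hypothetical decode config for mAES (placeholder values)
    beam-size: 10
    search-type: maes
    nstep: 2
    prefix-alpha: 2
    expansion-beta: 2
    expansion-gamma: 2.3
    score-norm-transducer: true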
-All algorithms share a common parameter to control beam size (`beam-size`) but each ones have its own parameters: +The algorithms share two parameters to control beam size (`beam-size`) and final hypotheses normalization (`score-norm-transducer`). The specific parameters for each algorithm are: # Default beam search search-type: default - score-norm-transducer: normalize final scores by length # Time-synchronous decoding search-type: tsd - max-sym-exp: number of maximum symbol expansions at each time step + max-sym-exp: [Number of maximum symbol expansions at each time step (int)] # Alignement-length decoding search-type: alsd - u-max: maximum output sequence length + u-max: [Maximum output sequence length (int)] # N-step Constrained beam search search-type: nsc - nstep: number of maximum expansion steps at each time step - (N exp. step = N symbol expansion + 1) - prefix-alpha: maximum prefix length in prefix search + nstep: [Number of maximum expansion steps at each time step (int)] + # nstep = max-sym-exp + 1 (blank) + prefix-alpha: [Maximum prefix length in prefix search (int)] -Except for the default algorithm, performance and decoding time can be controlled through described parameters. A high value will increase performance but also decoding time while a low value will decrease decoding time but will negatively impact performance. + # modified Adaptive Expansion Search + search-type: maes + nstep: [Number of maximum expansion steps at each time step (int, > 1)] + prefix-alpha: [Maximum prefix length in prefix search (int)] + expansion-beta: [Number of additional candidates in expanded hypotheses selection (int)] + expansion-gamma: [Allowed logp difference for prune-by-value method (float, > 0)] -IMPORTANT (temporary) note: ALSD, TSD and NSC have their execution time degraded because of the current batching implementation. We decided to keep it as if for internal discussions but it can be manually removed by the user to speed up inference. In a near future, the inference part for transducer will be replaced by our own torch lib. +Except for the default algorithm, the described parameters are used to control the performance and decoding speed. The optimal values for each parameter are task-dependent; a high value will typically increase decoding time to focus on performance, while a low value will improve decoding time at the expense of performance. -The algorithm references can be found in [methods documentation](https://github.com/espnet/espnet/tree/master/espnet/nets/beam_search_transducer.py). For more information about decoding usage, refer to [vivos config examples](https://github.com/espnet/espnet/tree/master/egs/vivos/asr1/conf/tuning/transducer). +#### Additional notes -Additional notes: -- Similarly to CTC training mode, transducer does not output the validation accuracy. Thus, the optimum model is selected with its loss value (i.e., --recog_model model.loss.best). -- There are several differences between MTL and transducer training/decoding options. The users should refer to `espnet/espnet/nets/pytorch_backend/e2e_asr_transducer.py` for an overview. -- RNN-decoder pre-initialization using a LM is supported. The LM state dict keys (`predictor.*`) will be matched to AM state dict keys (`dec.*`). -- Transformer-decoder pre-initialization using a transformer LM is not supported yet. -- Transformer and conformer blocks within the same architecture part (i.e: encoder) is not supported yet. -- Customizable architecture is a in-progress work and will be eventually extended to RNN.
Please report any encountered error or usage issue. +- Similarly to training with CTC, Transducer does not output the validation accuracy. Thus, the optimum model is selected with its loss value (i.e., `--recog_model model.loss.best`). +- There are several differences between MTL and Transducer training/decoding options. Users should refer to `espnet/espnet/nets/pytorch_backend/e2e_asr_transducer.py` for an overview and `espnet/espnet/nets/pytorch_backend/transducer/arguments` for all possible arguments. +- FastEmit regularization [[Yu et al., 2021]](https://arxiv.org/pdf/2010.11148) is available through the `--fastemit-lambda` training parameter (default = 0.0). +- RNN-decoder pre-initialization using an LM is supported. Note that regular decoder keys are expected. The LM state dict keys (`predictor.*`) will be renamed according to AM state dict keys (`dec.*`). +- Transformer-decoder pre-initialization using a Transformer LM is not supported yet. ### Changing the training configuration @@ -374,7 +466,7 @@ We expect the user to define the following options in its main training config ( ### Important notes - Given a pre-trained source model, the modules specified for transfer learning are expected to have the same parameters (i.e.: layers and units) as the target model modules. -- We also support initialization with a pre-trained RNN LM for the RNN-transducer decoder. +- We also support initialization with a pre-trained RNN LM for the RNN-Transducer decoder. - RNN models use different key names for encoder and decoder parts compared to Transformer, Conformer or Custom models: - RNN model use `enc.` for encoder part and `dec.` for decoder part. - Transformer/Conformer/Custom model use `encoder.` for encoder part and `decoder.` for decoder part. diff --git a/docker/.default_args b/docker/.default_args new file mode 100644 index 00000000000..48cadcda9a6 --- /dev/null +++ b/docker/.default_args @@ -0,0 +1,2 @@ +docker_cuda=11.1 +docker_os=20.04 diff --git a/docker/.gitignore b/docker/.gitignore index 9937f78b3dc..a8a0eb3cee5 100644 --- a/docker/.gitignore +++ b/docker/.gitignore @@ -1 +1,4 @@ espnet-local.tar +.custom_args +*.log +*.done \ No newline at end of file diff --git a/docker/build.sh b/docker/build.sh index 19db299e9c8..987a0f54ac7 100755 --- a/docker/build.sh +++ b/docker/build.sh @@ -3,16 +3,19 @@ # 2019, Nelson Yalta # 2019, Ludwig Kürzinger, Technische Universität München +log() { + local fname=${BASH_SOURCE[1]##*/} + echo -e "$(date '+%Y-%m-%dT%H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*" +} SCRIPTPATH="$( cd "$(dirname "$0")" ; pwd -P )" -tags="cpu-u18 - gpu-cuda10.0-cudnn7-u18 - gpu-cuda10.1-cudnn7-u18" -cuda_vers="10.0 - 10.1" -docker_ver=$(docker version -f '{{.Server.Version}}') -echo "Using Docker Ver.${docker_ver}" +# Default values +ubuntu_ver=20.04 +cuda_ver=11.1 +build_ver=cpu +build_cores=24 +th_ver=1.10.1 cmd_usage() { @@ -24,11 +27,12 @@ cmd_usage() { Also able to build containers based on local build configuration. USAGE ${PROGRAM} ${PROGRAM} build_and_push - ${PROGRAM} local [cpu|9.1|9.2|10.0|10.1] + ${PROGRAM} --build-ver [cpu|gpu] local mode Select script functionality args Set up building features Modes build build docker containers build_and_push build docker containers, test and push them to Docker Hub local build a docker container from the local ESPnet repository using the base image from Docker Hub (espnet/espnet:runtime) optional: cpu or CUDA version (default: cpu) fully_local like local, but also builds the base image + + Arguments + build-ver cpu/gpu + ubuntu-ver any ubuntu version available at docker hub (e.g. 18.04/20.04/...)
+ (default: 20.04) + cuda-ver any cuda version available at nvidia (e.g. 9.0/9.1/...) + (default: 11.1) + build-cores cores employed for building the container + th-ver PyTorch version for fully local build CAVEATS For local builds, the image pulled from Docker Hub is based on Ubuntu 16, @@ -51,68 +64,95 @@ cmd_usage() { build(){ - echo "Build docker containers" + log "Build latest docker containers" # build runtime and gpu based containers - docker_image=$( docker images -q espnet/espnet:runtime ) + this_tag=espnet/espnet:runtime-latest + docker_image=$( docker images -q ${this_tag} ) if ! [[ -n ${docker_image} ]]; then - docker build --build-arg DOCKER_VER=${docker_ver} -f prebuilt/runtime/Dockerfile -t espnet/espnet:runtime . || exit 1 + log "Now building Runtime container" + docker build --build-arg DOCKER_VER=${docker_ver} \ + --build-arg FROM_TAG=${default_ubuntu_ver} \ + --build-arg NUM_BUILD_CORES=${build_cores} \ + -f prebuilt/runtime.dockerfile -t ${this_tag} . | tee -a build_runtime.log > /dev/null + + docker_image=$( docker images -q ${this_tag} ) + [ -z "${docker_image}" ] && exit 1 + fi + + this_tag=espnet/espnet:cuda-latest + docker_image=$( docker images -q ${this_tag} ) + if ! [[ -n ${docker_image} ]]; then + log "Now building CUDA container" + docker build --build-arg FROM_TAG=runtime-latest \ + -f prebuilt/gpu.dockerfile -t ${this_tag} . | tee -a build_cuda.log > /dev/null + docker_image=$( docker images -q ${this_tag} ) + [ -z "${docker_image}" ] && exit 1 fi - for ver in ${cuda_vers}; do - docker_image=$( docker images -q espnet/espnet:cuda${ver}-cudnn7 ) - if ! [[ -n ${docker_image} ]]; then - docker build -f prebuilt/devel/gpu/${ver}/cudnn7/Dockerfile -t espnet/espnet:cuda${ver}-cudnn7 . || exit 1 - fi - done # build cpu based - docker_image=$( docker images -q espnet/espnet:cpu-u18 ) + this_tag=espnet/espnet:cpu-latest + docker_image=$( docker images -q ${this_tag} ) if ! [[ -n ${docker_image} ]]; then - echo "Now building cpu-u18" - docker build --build-arg FROM_TAG=runtime -f prebuilt/devel/Dockerfile -t espnet/espnet:cpu-u18 . || exit 1 + log "Now building cpu-latest with ubuntu:${default_ubuntu_ver}" + docker build --build-arg FROM_TAG=runtime-latest \ + -f prebuilt/devel.dockerfile \ + --target devel \ + -t ${this_tag} . | tee -a build_cpu.log > /dev/null + + docker_image=$( docker images -q ${this_tag} ) + [ -z "${docker_image}" ] && exit 1 fi + # build gpu based - for ver in ${cuda_vers}; do - build_args="--build-arg FROM_TAG=cuda${ver}-cudnn7" - build_args="${build_args} --build-arg CUDA_VER=${ver}" - docker_image=$( docker images -q espnet/espnet:gpu-cuda${ver}-cudnn7-u18 ) - if ! [[ -n ${docker_image} ]]; then - echo "Now building gpu-cuda${ver}-cudnn7-u18" - docker build ${build_args} -f prebuilt/devel/Dockerfile -t espnet/espnet:gpu-cuda${ver}-cudnn7-u18 . || exit 1 - fi - done + build_args="--build-arg FROM_TAG=cuda-latest + --build-arg CUDA_VER=${default_cuda_ver}" + this_tag=espnet/espnet:gpu-latest + docker_image=$( docker images -q ${this_tag} ) + if ! [[ -n ${docker_image} ]]; then + log "Now building gpu-latest with ubuntu:${default_ubuntu_ver} and cuda:${default_cuda_ver}" + docker build ${build_args} -f prebuilt/devel.dockerfile \ + --target devel \ + -t ${this_tag} . 
| tee -a build_gpu.log > /dev/null + docker_image=$( docker images -q ${this_tag} ) + [ -z "${docker_image}" ] && exit 1 + fi } build_local(){ - echo "Building docker container: base image, and image for ${ver}" + log "Building docker container: base image, and image for ${build_ver}" sleep 1 # prepare espnet-repo, assuming that this script is in folder espnet/docker cd ${SCRIPTPATH}/.. ESPNET_ARCHIVE="./espnet-local.tar" - echo "Reconstructing the local repository from the last commit" + log "Reconstructing the local repository from the last commit" git archive -o docker/${ESPNET_ARCHIVE} HEAD || exit 1 cd ${SCRIPTPATH} test -r ${ESPNET_ARCHIVE} || exit 1; sleep 1 - if [ "${build_base_image}" = true ] ; then - echo "building ESPnet base image" - docker build --build-arg DOCKER_VER=${docker_ver} -f prebuilt/runtime/Dockerfile -t espnet/espnet:runtime . || exit 1 + if [ "${build_base_image}" = true ]; then + log "building ESPnet base image with ubuntu:${ubuntu_ver}" + docker build --build-arg DOCKER_VER=${docker_ver} \ + --build-arg FROM_TAG=${ubuntu_ver} \ + --build-arg NUM_BUILD_CORES=${build_cores} \ + -f prebuilt/runtime/Dockerfile -t espnet/espnet:runtime-local . || exit 1 sleep 1 fi - if [[ ${ver} == "cpu" ]]; then - echo "building ESPnet CPU Image" - docker build --build-arg FROM_TAG=runtime --build-arg ESPNET_ARCHIVE=${ESPNET_ARCHIVE} \ + if [[ ${build_ver} == "cpu" ]]; then + log "building ESPnet CPU Image with ubuntu:${ubuntu_ver}" + docker build --build-arg FROM_TAG=runtime-local --build-arg ESPNET_ARCHIVE=${ESPNET_ARCHIVE} \ -f prebuilt/local/Dockerfile -t espnet/espnet:cpu-local . || exit 1 - elif [[ ${ver} =~ ^(9.1|9.2|10.0|10.1)$ ]]; then - echo "building ESPnet GPU Image for ${ver}" + elif [[ ${build_ver} == "gpu" ]]; then + log "building ESPnet GPU Image with ubuntu:${ubuntu_ver} and cuda:${cuda_ver}" if [ "${build_base_image}" = true ] ; then - docker build -f prebuilt/devel/gpu/${ver}/cudnn7/Dockerfile -t espnet/espnet:cuda${ver}-cudnn7 . || exit 1 + docker build -f prebuilt/devel/gpu/${cuda_ver}/Dockerfile -t espnet/espnet:cuda${cuda_ver}-cudnn7 . || exit 1 else - if ! [[ -n $( docker images -q espnet/espnet:cuda${ver}-cudnn7) ]]; then - docker pull espnet/espnet:cuda${ver}-cudnn7 + if ! [[ -n $( docker images -q espnet/espnet:cuda-latest) ]]; then + docker pull espnet/espnet:cuda-latest fi fi build_args="--build-arg FROM_TAG=cuda${ver}-cudnn7" @@ -120,105 +160,148 @@ build_local(){ build_args="${build_args} --build-arg ESPNET_ARCHIVE=${ESPNET_ARCHIVE}" docker build ${build_args} -f prebuilt/local/Dockerfile -t espnet/espnet:gpu-cuda${ver}-cudnn7-u18-local . || exit 1 else - echo "Parameter invalid: " ${ver} + log "ERROR: Parameter invalid: " ${build_ver} fi - echo "cleanup." + log "cleanup."
test -r ${ESPNET_ARCHIVE} && rm ${ESPNET_ARCHIVE} } +run_recipe1(){ + ./run.sh --docker-egs mini_an4/asr1 \ + --docker-cmd run.sh \ + --docker-gpu ${1} \ + --verbose 1 \ + --backend ${2} \ + --ngpu ${3} \ + --stage ${4} \ + --tag train_nodev_${2}_${5} | tee -a ${PWD}/testing_${5}_${2}.log > /dev/null +} + +run_recipe2(){ + ./run.sh --docker-egs mini_an4/asr1 \ + --docker-cmd run.sh \ + --docker-gpu ${1} \ + --docker-env "NLTK_DATA=/espnet/egs2/mini_an4/asr1/nltk_data,HOME=/espnet/egs2/mini_an4/asr1" \ + --is-egs2 \ + --ngpu ${2} \ + --stage ${3} \ + --asr-tag train_nodev_${4} \ + --lm-tag train_nodev_${4} | tee -a ${PWD}/testing2_pytorch_${4}.log > /dev/null +} testing(){ - echo "Testing docker containers" + log "Testing docker containers" # Test Docker Containers with cpu setup run_stage=-1 - if [ -f ../egs/an4/asr1/dump/train_nodev/deltafalse/data.json ]; then - run_stage=3 - fi - for cuda_ver in cpu ${cuda_vers};do - for backend in pytorch chainer;do - if [ "${cuda_ver}" != "cpu" ];then - docker_cuda="--docker-cuda ${cuda_ver}" - gpu=0 - ngpu=1 - else - docker_cuda="" - gpu=-1 - ngpu=0 - fi - ( ./run.sh ${docker_cuda} \ - --docker-egs an4/asr1 \ - --docker-cmd run.sh \ - --docker-gpu ${gpu} \ - --verbose 1 \ - --backend ${backend} \ - --ngpu ${ngpu} \ - --stage ${run_stage} \ - --tag train_nodev_${backend}_cuda${cuda_ver} ) || exit 1 - done + for backend in chainer pytorch; do + if [ -f ../egs/mini_an4/asr1/dump/train_nodev/deltafalse/data.json ]; then + run_stage=3 + fi + if [ ! -f .test_cpu_${backend}.done ]; then + run_recipe1 -1 ${backend} 0 ${run_stage} "cpu" + touch .test_cpu_${backend}.done + fi done - echo "ESPnet egs Done. Press to continue with ESPnet2 egs" + for backend in chainer pytorch; do + if [ -f ../egs/mini_an4/asr1/dump/train_nodev/deltafalse/data.json ]; then + run_stage=3 + fi + if [ ! -f .test_gpu_${backend}.done ]; then + run_recipe1 0 ${backend} 1 ${run_stage} "gpu" + touch .test_gpu_${backend}.done + fi + done + + log "ESPnet egs Done. Press <enter> to continue with ESPnet2 egs" read enter # Test for espnet2 run_stage=-1 - if [ -f ../egs2/an4/asr1/dump/raw/train_nodev/text ]; then - run_stage=9 + # + if [ ! -f .test2_cpu_${backend}.done ]; then + run_recipe2 -1 0 ${run_stage} "cpu" + touch .test2_cpu_${backend}.done + fi + run_stage=6 + if [ ! 
-f .test2_gpu_${backend}.done ]; then + run_recipe2 0 1 ${run_stage} "gpu" + touch .test2_gpu_${backend}.done fi - for cuda_ver in cpu ${cuda_vers};do - if [ "${cuda_ver}" != "cpu" ];then - docker_cuda="--docker-cuda ${cuda_ver}" - gpu=0 - ngpu=1 - else - docker_cuda="" - gpu=-1 - ngpu=0 - fi - ( ./run.sh ${docker_cuda} \ - --docker-egs an4/asr1 \ - --docker-cmd run.sh \ - --docker-gpu ${gpu} \ - --is-egs2 \ - --ngpu ${ngpu} \ - --stage ${run_stage} \ - --asr_tag train_nodev_cuda${cuda_ver} \ - --lm_tag train_nodev_cuda${cuda_ver}) || exit 1 - run_stage=3 - done } push(){ - for tag in ${tags};do - echo "docker push espnet/espnet:${tag}" + for tag in runtime-latest cuda-latest cpu-latest gpu-latest;do + log "docker push espnet/espnet:${tag}" ( docker push espnet/espnet:${tag} )|| exit 1 done } +## Parameter initialization: +while test $# -gt 0 +do + case "$1" in + -h) cmd_usage + exit 0;; + --help) cmd_usage + exit 0;; + --*) ext=${1#--} + ext=${ext//-/_} + frombreak=true + for i in _ {a..z} {A..Z}; do + for var in `eval echo "\\${!${i}@}"`; do + if [ "$var" == "$ext" ]; then + eval ${ext}=$2 + frombreak=false + shift + break 2 + fi + done + done + if ${frombreak} ; then + echo "bad option $1" + exit 1 + fi + ;; + *) break + ;; + esac + shift +done -## Parameter initialization: cpu or gpu docker container (default: cpu) -if [[ -z "$2" ]]; then - ver='cpu' -else - ver=$2 + +mode=$1 +default_ubuntu_ver=20.04 +default_cuda_ver=11.1 + +check=true +[ "${default_ubuntu_ver}" != "${ubuntu_ver}" ] || [ "${default_cuda_ver}" != "${cuda_ver}" ] && check=false + +if [ ${check} = false ] && [ "${mode}" != "fully_local" ]; then + log "Error: Use of custom versions of Ubuntu (!=${default_ubuntu_ver}) and CUDA (!=${default_cuda_ver}) + is only available for mode == fully_local. + Exiting... " + exit 1; fi +docker_ver=$(docker version -f '{{.Server.Version}}') +log "Using Docker Ver.${docker_ver}" ## Application menu -if [[ $1 == "build" ]]; then +if [[ "${mode}" == "build" ]]; then build -elif [[ $1 == "local" ]]; then +elif [[ "${mode}" == "local" ]]; then build_base_image=false build_local -elif [[ $1 == "fully_local" ]]; then +elif [[ "${mode}" == "fully_local" ]]; then build_base_image=true build_local -elif [[ $1 == "push" ]]; then +elif [[ "${mode}" == "push" ]]; then push -elif [[ $1 == "test" ]]; then +elif [[ "${mode}" == "test" ]]; then testing -elif [[ $1 == "build_and_push" ]]; then +elif [[ "${mode}" == "build_and_push" ]]; then build testing push @@ -226,4 +309,4 @@ else cmd_usage fi -echo "$(basename "$0") done." +log "$(basename "$0") done." diff --git a/docker/espnet.dockerfile b/docker/espnet.dockerfile new file mode 100644 index 00000000000..b6295ca2a92 --- /dev/null +++ b/docker/espnet.dockerfile @@ -0,0 +1,23 @@ +ARG FROM_TAG +# For cuda-based images, the distribution will include cuda +FROM espnet/espnet:${FROM_TAG} +LABEL maintainer "Nelson Yalta " + +ARG THIS_USER +ARG THIS_UID +ARG EXTRA_LIBS + +# Add extra libraries (VC/TTS) + +RUN if [ ${EXTRA_LIBS} = true ]; then \ + cd /espnet/tools; \ + make extra; \ + fi + +# Add user to container +RUN if [ ! 
-z "${THIS_UID}" ]; then \ + useradd -m -r -u ${THIS_UID} -g root ${THIS_USER}; \ + fi + +USER ${THIS_USER} +WORKDIR / diff --git a/docker/prebuilt/Dockerfile b/docker/prebuilt/Dockerfile deleted file mode 100644 index 63bc7b73525..00000000000 --- a/docker/prebuilt/Dockerfile +++ /dev/null @@ -1,25 +0,0 @@ -ARG FROM_TAG -# For cuda-based images, The distribution will include cuda, cudnn, nccl -FROM espnet/espnet:${FROM_TAG} -LABEL maintainer "Nelson Yalta " - -ARG THIS_USER -ARG THIS_UID -ARG EXTRA_LIBS - -# Add extra libraries (VC/TTS) - -RUN if [ ${EXTRA_LIBS} = true ]; then \ - cd /espnet/tools; \ - . ./activate_python.sh; \ - pip install parallel_wavegan; \ - pip install git+https://github.com/cybertronai/pytorch-lamb; \ - fi - -# Add user to container -RUN if [ ! -z "${THIS_UID}" ]; then \ - useradd -m -r -u ${THIS_UID} -g root ${THIS_USER}; \ - fi - -USER ${THIS_USER} -WORKDIR / diff --git a/docker/prebuilt/devel.dockerfile b/docker/prebuilt/devel.dockerfile new file mode 100644 index 00000000000..95dc6a41059 --- /dev/null +++ b/docker/prebuilt/devel.dockerfile @@ -0,0 +1,83 @@ +ARG FROM_TAG +FROM espnet/espnet:${FROM_TAG} as devel +LABEL maintainer "Nelson Yalta " + +ARG CUDA_VER +ENV CUDA_VER ${CUDA_VER} + +ARG TH_VERSION +ENV TH_VERSION ${TH_VERSION} +WORKDIR / + +ARG ESPNET_LOCATION=https://github.com/espnet/espnet + +ENV PATH=/opt/miniconda/bin:${PATH} + +# Download ESPnet +RUN git clone ${ESPNET_LOCATION} && \ + cd espnet && \ + rm -rf docker egs egs2 espnet2 test utils && \ + rm -rf .git + +# Install espnet +WORKDIR /espnet/tools + +# Disable cupy test +# Docker build does not load libcuda.so.1 +# Replace nvidia-smi for nvcc because docker does not load nvidia-smi +RUN if [ -z "${CUDA_VER}" ]; then \ + echo "Build without CUDA" && \ + MY_OPTS='CUPY_VERSION=""'; \ + else \ + echo "Build with CUDA ${CUDA_VER}" && \ + # Docker containers cannot load cuda libs during build. + # So, their checks on cuda packages are disabled. + sed -i '200s|install.py|install.py --no-cuda --no-cupy |' Makefile && \ + export CFLAGS="-I${CUDA_HOME}/include ${CFLAGS}" && \ + MY_OPTS="CUDA_VERSION=${CUDA_VER}" && \ + . ./setup_cuda_env.sh /usr/local/cuda; \ + fi; \ + if [ ! -z "${TH_VERSION}" ]; then \ + MY_OPTS="${MY_OPTS} TH_VERSION=${TH_VERSION} "; \ + fi; \ + echo "Make with options ${MY_OPTS}" && \ + ln -s /opt/kaldi ./ && \ + rm -f activate_python.sh && touch activate_python.sh && \ + conda install -y conda "python=3.7.4" && \ + make KALDI=/opt/kaldi ${MY_OPTS} && \ + . 
./activate_python.sh && \ + ./installers/install_warp-ctc.sh && \ + ./installers/install_kenlm.sh && \ + ./installers/install_chainer.sh cpu && \ + conda clean --all && \ + rm -f *.tar.* && \ + pip cache purge + +RUN rm -rf ../espnet + +WORKDIR / + + +#### For local docker +FROM devel as espnet_local +LABEL maintainer "Nelson Yalta " + +ARG CUDA_VER +WORKDIR / + +# IF using a local ESPNet repository, a temporary file containing the ESPnet git repo is copied over +ARG ESPNET_ARCHIVE=./espnet-local.tar +COPY ${ESPNET_ARCHIVE} /espnet-local.tar + + +# Download ESPnet +RUN echo "Getting ESPnet sources from local repository, in temporary file: " ${ESPNET_ARCHIVE} +RUN mkdir /espnet +RUN tar xf espnet-local.tar -C /espnet/ +RUN rm espnet-local.tar + +RUN cd espnet && \ + rm -rf docker egs test utils + +# Install espnet +WORKDIR /espnet/tools diff --git a/docker/prebuilt/devel/Dockerfile b/docker/prebuilt/devel/Dockerfile deleted file mode 100644 index 55ddfdcdf90..00000000000 --- a/docker/prebuilt/devel/Dockerfile +++ /dev/null @@ -1,44 +0,0 @@ -ARG FROM_TAG -FROM espnet/espnet:${FROM_TAG} -LABEL maintainer "Nelson Yalta " - -ARG CUDA_VER -WORKDIR / - -ARG ESPNET_LOCATION=https://github.com/espnet/espnet - -# Download ESPnet -RUN git clone ${ESPNET_LOCATION} && \ - cd espnet && \ - rm -rf docker egs egs2 espnet2 test utils - -# Install espnet -WORKDIR /espnet/tools - -# Disable cupy test -# Docker build does not load libcuda.so.1 -# Replace nvidia-smi for nvcc because docker does not load nvidia-smi -RUN if [ -z "$( which nvcc )" ]; then \ - echo "Build without CUDA" && \ - MY_OPTS='CUPY_VERSION="" TH_VERSION=1.6.0'; \ - else \ - echo "Build with CUDA" && \ - # Docker containers cannot load cuda libs during build. - # So, their checks on cuda packages are disabled. - sed -i '200s|install.py|install.py --no-cuda --no-cupy |' Makefile && \ - export CFLAGS="-I${CUDA_HOME}/include ${CFLAGS}" && \ - MY_OPTS="CUDA_VERSION=${CUDA_VER}" && \ - . 
./setup_cuda_env.sh /usr/local/cuda; \ - fi; \ - if [ "${CUDA_VER}" = "10.1" ]; then \ - # warpctc is not supported from Pytorch 1.3.1 - MY_OPTS="${MY_OPTS} TH_VERSION=1.6.0"; \ - fi; \ - echo "Make with options ${MY_OPTS}" && \ - ln -s /kaldi ./ && \ - ./setup_anaconda.sh /miniconda espnet 3.7.4 && \ - make KALDI=/kaldi ${MY_OPTS} - -RUN rm -rf ../espnet - -WORKDIR / diff --git a/docker/prebuilt/devel/gpu/10.0/cudnn7/Dockerfile b/docker/prebuilt/devel/gpu/10.0/cudnn7/Dockerfile deleted file mode 100644 index 6556f9acab0..00000000000 --- a/docker/prebuilt/devel/gpu/10.0/cudnn7/Dockerfile +++ /dev/null @@ -1,73 +0,0 @@ -FROM espnet/espnet:runtime -LABEL maintainer "Nelson Yalta " - -## FROM CUDA 10.0 base - -RUN apt-get update && apt-get install -y --no-install-recommends gnupg2 curl ca-certificates && \ - curl -fsSL https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/7fa2af80.pub | apt-key add - && \ - echo "deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64 /" > /etc/apt/sources.list.d/cuda.list && \ - echo "deb https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64 /" > /etc/apt/sources.list.d/nvidia-ml.list && \ - apt-get purge --autoremove -y curl && \ - rm -rf /var/lib/apt/lists/* - -ENV CUDA_VERSION 10.0.130 - -ENV CUDA_PKG_VERSION 10-0=$CUDA_VERSION-1 - -# For libraries in the cuda-compat-* package: https://docs.nvidia.com/cuda/eula/index.html#attachment-a -RUN apt-get update && apt-get install -y --no-install-recommends \ - cuda-cudart-$CUDA_PKG_VERSION \ - cuda-compat-10-0 && \ - ln -s cuda-10.0 /usr/local/cuda && \ - rm -rf /var/lib/apt/lists/* - -# Required for nvidia-docker v1 -RUN echo "/usr/local/nvidia/lib" >> /etc/ld.so.conf.d/nvidia.conf && \ - echo "/usr/local/nvidia/lib64" >> /etc/ld.so.conf.d/nvidia.conf - -ENV PATH /usr/local/nvidia/bin:/usr/local/cuda/bin:${PATH} -ENV LD_LIBRARY_PATH /usr/local/nvidia/lib:/usr/local/nvidia/lib64 - -# nvidia-container-runtime -ENV NVIDIA_VISIBLE_DEVICES all -ENV NVIDIA_DRIVER_CAPABILITIES compute,utility -ENV NVIDIA_REQUIRE_CUDA "cuda>=10.0 brand=tesla,driver>=384,driver<385 brand=tesla,driver>=410,driver<411" - -ENV CUDA_HOME /usr/local/cuda - -## FROM CUDA 10.0 runtime - -ENV NCCL_VERSION 2.4.8 - -RUN apt-get update && apt-get install -y --no-install-recommends \ - cuda-libraries-$CUDA_PKG_VERSION \ - cuda-nvtx-$CUDA_PKG_VERSION \ - libnccl2=$NCCL_VERSION-1+cuda10.0 && \ - apt-mark hold libnccl2 && \ - rm -rf /var/lib/apt/lists/* - - -## FROM CUDA 10.0 devel - -RUN apt-get update && apt-get install -y --no-install-recommends \ - cuda-libraries-dev-$CUDA_PKG_VERSION \ - cuda-nvml-dev-$CUDA_PKG_VERSION \ - cuda-minimal-build-$CUDA_PKG_VERSION \ - cuda-command-line-tools-$CUDA_PKG_VERSION \ - libnccl-dev=$NCCL_VERSION-1+cuda10.0 && \ - rm -rf /var/lib/apt/lists/* - -ENV LIBRARY_PATH /usr/local/cuda/lib64/stubs - -## FROM CUDA 10.0-CUDNN 7 devel - -ENV CUDNN_VERSION 7.6.5.32 -LABEL com.nvidia.cudnn.version="${CUDNN_VERSION}" - -RUN apt-get update && apt-get install -y --no-install-recommends \ - libcudnn7=$CUDNN_VERSION-1+cuda10.0 \ - libcudnn7-dev=$CUDNN_VERSION-1+cuda10.0 && \ - apt-mark hold libcudnn7 && \ - rm -rf /var/lib/apt/lists/* - -WORKDIR / \ No newline at end of file diff --git a/docker/prebuilt/devel/gpu/10.1/cudnn7/Dockerfile b/docker/prebuilt/devel/gpu/10.1/cudnn7/Dockerfile deleted file mode 100644 index adcdace6628..00000000000 --- a/docker/prebuilt/devel/gpu/10.1/cudnn7/Dockerfile +++ /dev/null @@ -1,80 +0,0 @@ -FROM espnet/espnet:runtime 
-LABEL maintainer "Nelson Yalta " - -## FROM CUDA 10.1 base [https://gitlab.com/nvidia/cuda/blob/ubuntu18.04/10.1/base/Dockerfile] - -RUN apt-get update && apt-get install -y --no-install-recommends gnupg2 curl ca-certificates && \ - curl -fsSL https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/7fa2af80.pub | apt-key add - && \ - echo "deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64 /" > /etc/apt/sources.list.d/cuda.list && \ - echo "deb https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64 /" > /etc/apt/sources.list.d/nvidia-ml.list && \ - apt-get purge --autoremove -y curl && \ - rm -rf /var/lib/apt/lists/* - -ENV CUDA_VERSION 10.1.168 - -ENV CUDA_PKG_VERSION 10-1=$CUDA_VERSION-1 - -# For libraries in the cuda-compat-* package: https://docs.nvidia.com/cuda/eula/index.html#attachment-a -RUN apt-get update && apt-get install -y --no-install-recommends \ - cuda-cudart-$CUDA_PKG_VERSION \ - cuda-compat-10-1 && \ - ln -s cuda-10.1 /usr/local/cuda && \ - rm -rf /var/lib/apt/lists/* - -# Required for nvidia-docker v1 -RUN echo "/usr/local/nvidia/lib" >> /etc/ld.so.conf.d/nvidia.conf && \ - echo "/usr/local/nvidia/lib64" >> /etc/ld.so.conf.d/nvidia.conf - -ENV PATH /usr/local/nvidia/bin:/usr/local/cuda/bin:${PATH} -ENV LD_LIBRARY_PATH /usr/local/nvidia/lib:/usr/local/nvidia/lib64 - -# nvidia-container-runtime -ENV NVIDIA_VISIBLE_DEVICES all -ENV NVIDIA_DRIVER_CAPABILITIES compute,utility -ENV NVIDIA_REQUIRE_CUDA "cuda>=10.1 brand=tesla,driver>=384,driver<385 brand=tesla,driver>=396,driver<397 brand=tesla,driver>=410,driver<411" - -ENV CUDA_HOME /usr/local/cuda - -## FROM CUDA 10.1 runtime [https://gitlab.com/nvidia/cuda/blob/ubuntu18.04/10.1/runtime/Dockerfile] - -ENV NCCL_VERSION 2.7.8 - -RUN apt-get update && apt-get install -y --no-install-recommends \ - cuda-libraries-$CUDA_PKG_VERSION \ - cuda-nvtx-$CUDA_PKG_VERSION \ - libnccl2=$NCCL_VERSION-1+cuda10.1 && \ - apt-mark hold libnccl2 && \ - rm -rf /var/lib/apt/lists/* - -## FROM CUDA 10.1 devel [https://gitlab.com/nvidia/cuda/blob/ubuntu18.04/10.1/devel/Dockerfile] - -RUN apt-get update && apt-get install -y --no-install-recommends \ - cuda-nvml-dev-$CUDA_PKG_VERSION \ - cuda-command-line-tools-$CUDA_PKG_VERSION \ - cuda-nvprof-$CUDA_PKG_VERSION \ - cuda-npp-dev-$CUDA_PKG_VERSION \ - cuda-libraries-dev-$CUDA_PKG_VERSION \ - cuda-minimal-build-$CUDA_PKG_VERSION \ - libcublas-dev=10.2.1.243-1 \ - libnccl-dev=2.7.8-1+cuda10.1 && \ - apt-mark hold libnccl-dev && \ - rm -rf /var/lib/apt/lists/* - -# apt from auto upgrading the cublas package. 
See https://gitlab.com/nvidia/container-images/cuda/-/issues/88 -RUN apt-mark hold libcublas-dev - - -ENV LIBRARY_PATH /usr/local/cuda/lib64/stubs - -## FROM CUDA 10.1-CUDNN 7 devel - -ENV CUDNN_VERSION 7.6.0.64 -LABEL com.nvidia.cudnn.version="${CUDNN_VERSION}" - -RUN apt-get update && apt-get install -y --no-install-recommends \ - libcudnn7=$CUDNN_VERSION-1+cuda10.1 \ - libcudnn7-dev=$CUDNN_VERSION-1+cuda10.1 && \ - apt-mark hold libcudnn7 && \ - rm -rf /var/lib/apt/lists/* - -WORKDIR / \ No newline at end of file diff --git a/docker/prebuilt/devel/gpu/8.0/cudnn7/Dockerfile b/docker/prebuilt/devel/gpu/8.0/cudnn7/Dockerfile deleted file mode 100644 index 64379a2531e..00000000000 --- a/docker/prebuilt/devel/gpu/8.0/cudnn7/Dockerfile +++ /dev/null @@ -1,79 +0,0 @@ -FROM espnet/espnet:runtime -LABEL maintainer "Nelson Yalta " - -## FROM CUDA 8.0 runtime - -RUN apt-get update && apt-get install -y --no-install-recommends ca-certificates apt-transport-https gnupg-curl && \ - rm -rf /var/lib/apt/lists/* && \ - NVIDIA_GPGKEY_SUM=d1be581509378368edeec8c1eb2958702feedf3bc3d17011adbf24efacce4ab5 && \ - NVIDIA_GPGKEY_FPR=ae09fe4bbd223a84b2ccfce3f60f4b3d7fa2af80 && \ - apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1604/x86_64/7fa2af80.pub && \ - apt-key adv --export --no-emit-version -a $NVIDIA_GPGKEY_FPR | tail -n +5 > cudasign.pub && \ - echo "$NVIDIA_GPGKEY_SUM cudasign.pub" | sha256sum -c --strict - && rm cudasign.pub && \ - echo "deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1604/x86_64 /" > /etc/apt/sources.list.d/cuda.list - -ENV CUDA_VERSION 8.0.61 - -ENV CUDA_PKG_VERSION 8-0=$CUDA_VERSION-1 -RUN apt-get update && apt-get install -y --no-install-recommends \ - cuda-nvrtc-$CUDA_PKG_VERSION \ - cuda-nvgraph-$CUDA_PKG_VERSION \ - cuda-cusolver-$CUDA_PKG_VERSION \ - cuda-cublas-8-0=8.0.61.2-1 \ - cuda-cufft-$CUDA_PKG_VERSION \ - cuda-curand-$CUDA_PKG_VERSION \ - cuda-cusparse-$CUDA_PKG_VERSION \ - cuda-npp-$CUDA_PKG_VERSION \ - cuda-cudart-$CUDA_PKG_VERSION && \ - ln -s cuda-8.0 /usr/local/cuda && \ - rm -rf /var/lib/apt/lists/* - -# nvidia-docker 1.0 -LABEL com.nvidia.volumes.needed="nvidia_driver" -LABEL com.nvidia.cuda.version="${CUDA_VERSION}" - -RUN echo "/usr/local/nvidia/lib" >> /etc/ld.so.conf.d/nvidia.conf && \ - echo "/usr/local/nvidia/lib64" >> /etc/ld.so.conf.d/nvidia.conf - -ENV PATH /usr/local/nvidia/bin:/usr/local/cuda/bin:${PATH} -ENV LD_LIBRARY_PATH /usr/local/nvidia/lib:/usr/local/nvidia/lib64 -ENV CUDA_HOME /usr/local/cuda - -# nvidia-container-runtime -ENV NVIDIA_VISIBLE_DEVICES all -ENV NVIDIA_DRIVER_CAPABILITIES compute,utility -ENV NVIDIA_REQUIRE_CUDA "cuda>=8.0" - -## FROM CUDA 8.0 devel - -RUN apt-get update && apt-get install -y --no-install-recommends \ - cuda-core-$CUDA_PKG_VERSION \ - cuda-misc-headers-$CUDA_PKG_VERSION \ - cuda-command-line-tools-$CUDA_PKG_VERSION \ - cuda-nvrtc-dev-$CUDA_PKG_VERSION \ - cuda-nvml-dev-$CUDA_PKG_VERSION \ - cuda-nvgraph-dev-$CUDA_PKG_VERSION \ - cuda-cusolver-dev-$CUDA_PKG_VERSION \ - cuda-cublas-dev-8-0=8.0.61.2-1 \ - cuda-cufft-dev-$CUDA_PKG_VERSION \ - cuda-curand-dev-$CUDA_PKG_VERSION \ - cuda-cusparse-dev-$CUDA_PKG_VERSION \ - cuda-npp-dev-$CUDA_PKG_VERSION \ - cuda-cudart-dev-$CUDA_PKG_VERSION \ - cuda-driver-dev-$CUDA_PKG_VERSION && \ - rm -rf /var/lib/apt/lists/* - -ENV LIBRARY_PATH /usr/local/cuda/lib64/stubs - -## FROM CUDA 8.0 CUDNN 7 devel - -RUN echo "deb https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1604/x86_64 /" > 
/etc/apt/sources.list.d/nvidia-ml.list - -ENV CUDNN_VERSION 7.2.1.38 -LABEL com.nvidia.cudnn.version="${CUDNN_VERSION}" - -RUN apt-get update && apt-get install -y --no-install-recommends \ - libcudnn7=$CUDNN_VERSION-1+cuda8.0 \ - libcudnn7-dev=$CUDNN_VERSION-1+cuda8.0 && \ - apt-mark hold libcudnn7 && \ - rm -rf /var/lib/apt/lists/* diff --git a/docker/prebuilt/devel/gpu/9.0/cudnn7/Dockerfile b/docker/prebuilt/devel/gpu/9.0/cudnn7/Dockerfile deleted file mode 100644 index a4147705ef1..00000000000 --- a/docker/prebuilt/devel/gpu/9.0/cudnn7/Dockerfile +++ /dev/null @@ -1,75 +0,0 @@ -FROM espnet/espnet:runtime -LABEL maintainer "Nelson Yalta " - -## FROM CUDA 9.0 base - -RUN apt-get update && apt-get install -y --no-install-recommends ca-certificates apt-transport-https gnupg-curl && \ - rm -rf /var/lib/apt/lists/* && \ - NVIDIA_GPGKEY_SUM=d1be581509378368edeec8c1eb2958702feedf3bc3d17011adbf24efacce4ab5 && \ - NVIDIA_GPGKEY_FPR=ae09fe4bbd223a84b2ccfce3f60f4b3d7fa2af80 && \ - apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1604/x86_64/7fa2af80.pub && \ - apt-key adv --export --no-emit-version -a $NVIDIA_GPGKEY_FPR | tail -n +5 > cudasign.pub && \ - echo "$NVIDIA_GPGKEY_SUM cudasign.pub" | sha256sum -c --strict - && rm cudasign.pub && \ - echo "deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1604/x86_64 /" > /etc/apt/sources.list.d/cuda.list && \ - echo "deb https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1604/x86_64 /" > /etc/apt/sources.list.d/nvidia-ml.list - -ENV CUDA_VERSION 9.0.176 - -ENV CUDA_PKG_VERSION 9-0=$CUDA_VERSION-1 -RUN apt-get update && apt-get install -y --no-install-recommends \ - cuda-cudart-$CUDA_PKG_VERSION && \ - ln -s cuda-9.0 /usr/local/cuda && \ - rm -rf /var/lib/apt/lists/* - -# nvidia-docker 1.0 -LABEL com.nvidia.volumes.needed="nvidia_driver" -LABEL com.nvidia.cuda.version="${CUDA_VERSION}" - -RUN echo "/usr/local/nvidia/lib" >> /etc/ld.so.conf.d/nvidia.conf && \ - echo "/usr/local/nvidia/lib64" >> /etc/ld.so.conf.d/nvidia.conf - -ENV PATH /usr/local/nvidia/bin:/usr/local/cuda/bin:${PATH} -ENV LD_LIBRARY_PATH /usr/local/nvidia/lib:/usr/local/nvidia/lib64 -ENV CUDA_HOME /usr/local/cuda - -# nvidia-container-runtime -ENV NVIDIA_VISIBLE_DEVICES all -ENV NVIDIA_DRIVER_CAPABILITIES compute,utility -ENV NVIDIA_REQUIRE_CUDA "cuda>=9.0" - -## FROM CUDA 9.0 runtime - -ENV NCCL_VERSION 2.4.2 - -RUN apt-get update && apt-get install -y --no-install-recommends \ - cuda-libraries-$CUDA_PKG_VERSION \ - cuda-cublas-9-0=9.0.176.4-1 \ - libnccl2=$NCCL_VERSION-1+cuda9.0 && \ - apt-mark hold libnccl2 && \ - rm -rf /var/lib/apt/lists/* - -## FROM CUDA 9.0 devel - -RUN apt-get update && apt-get install -y --no-install-recommends \ - cuda-libraries-dev-$CUDA_PKG_VERSION \ - cuda-nvml-dev-$CUDA_PKG_VERSION \ - cuda-minimal-build-$CUDA_PKG_VERSION \ - cuda-command-line-tools-$CUDA_PKG_VERSION \ - cuda-core-9-0=9.0.176.3-1 \ - cuda-cublas-dev-9-0=9.0.176.4-1 \ - libnccl-dev=$NCCL_VERSION-1+cuda9.0 && \ - rm -rf /var/lib/apt/lists/* - -ENV LIBRARY_PATH /usr/local/cuda/lib64/stubs - -## FROM CUDA 9.0 CUDNN 7 devel - -ENV CUDNN_VERSION 7.4.2.24 -LABEL com.nvidia.cudnn.version="${CUDNN_VERSION}" - -RUN apt-get update && apt-get install -y --no-install-recommends \ - libcudnn7=$CUDNN_VERSION-1+cuda9.0 \ - libcudnn7-dev=$CUDNN_VERSION-1+cuda9.0 && \ - apt-mark hold libcudnn7 && \ - rm -rf /var/lib/apt/lists/* - diff --git a/docker/prebuilt/devel/gpu/9.1/cudnn7/Dockerfile 
b/docker/prebuilt/devel/gpu/9.1/cudnn7/Dockerfile deleted file mode 100644 index 05823cf717a..00000000000 --- a/docker/prebuilt/devel/gpu/9.1/cudnn7/Dockerfile +++ /dev/null @@ -1,71 +0,0 @@ -FROM espnet/espnet:runtime -LABEL maintainer "Nelson Yalta " - -## FROM CUDA 9.1 base - -RUN apt-get update && apt-get install -y --no-install-recommends ca-certificates apt-transport-https gnupg-curl && \ - rm -rf /var/lib/apt/lists/* && \ - NVIDIA_GPGKEY_SUM=d1be581509378368edeec8c1eb2958702feedf3bc3d17011adbf24efacce4ab5 && \ - NVIDIA_GPGKEY_FPR=ae09fe4bbd223a84b2ccfce3f60f4b3d7fa2af80 && \ - apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1604/x86_64/7fa2af80.pub && \ - apt-key adv --export --no-emit-version -a $NVIDIA_GPGKEY_FPR | tail -n +5 > cudasign.pub && \ - echo "$NVIDIA_GPGKEY_SUM cudasign.pub" | sha256sum -c --strict - && rm cudasign.pub && \ - echo "deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1604/x86_64 /" > /etc/apt/sources.list.d/cuda.list && \ - echo "deb https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1604/x86_64 /" > /etc/apt/sources.list.d/nvidia-ml.list - -ENV CUDA_VERSION 9.1.85 - -ENV CUDA_PKG_VERSION 9-1=$CUDA_VERSION-1 -RUN apt-get update && apt-get install -y --no-install-recommends \ - cuda-cudart-$CUDA_PKG_VERSION && \ - ln -s cuda-9.1 /usr/local/cuda && \ - rm -rf /var/lib/apt/lists/* - -# nvidia-docker 1.0 -LABEL com.nvidia.volumes.needed="nvidia_driver" -LABEL com.nvidia.cuda.version="${CUDA_VERSION}" - -RUN echo "/usr/local/nvidia/lib" >> /etc/ld.so.conf.d/nvidia.conf && \ - echo "/usr/local/nvidia/lib64" >> /etc/ld.so.conf.d/nvidia.conf - -ENV PATH /usr/local/nvidia/bin:/usr/local/cuda/bin:${PATH} -ENV LD_LIBRARY_PATH /usr/local/nvidia/lib:/usr/local/nvidia/lib64 -ENV CUDA_HOME /usr/local/cuda - -# nvidia-container-runtime -ENV NVIDIA_VISIBLE_DEVICES all -ENV NVIDIA_DRIVER_CAPABILITIES compute,utility -ENV NVIDIA_REQUIRE_CUDA "cuda>=9.1" - -## FROM CUDA 9.1 runtime - -ENV NCCL_VERSION 2.2.12 - -RUN apt-get update && apt-get install -y --no-install-recommends \ - cuda-libraries-$CUDA_PKG_VERSION \ - libnccl2=$NCCL_VERSION-1+cuda9.1 && \ - apt-mark hold libnccl2 && \ - rm -rf /var/lib/apt/lists/* - -## FROM CUDA 9.1 devel - -RUN apt-get update && apt-get install -y --no-install-recommends \ - cuda-libraries-dev-$CUDA_PKG_VERSION \ - cuda-nvml-dev-$CUDA_PKG_VERSION \ - cuda-minimal-build-$CUDA_PKG_VERSION \ - cuda-command-line-tools-$CUDA_PKG_VERSION \ - libnccl-dev=$NCCL_VERSION-1+cuda9.1 && \ - rm -rf /var/lib/apt/lists/* - -ENV LIBRARY_PATH /usr/local/cuda/lib64/stubs - -## FROM CUDA 9.1 CUDNN 7 - -ENV CUDNN_VERSION 7.1.2.21 -LABEL com.nvidia.cudnn.version="${CUDNN_VERSION}" - -RUN apt-get update && apt-get install -y --no-install-recommends \ - libcudnn7=$CUDNN_VERSION-1+cuda9.1 \ - libcudnn7-dev=$CUDNN_VERSION-1+cuda9.1 && \ - apt-mark hold libcudnn7 && \ - rm -rf /var/lib/apt/lists/* diff --git a/docker/prebuilt/devel/gpu/9.2/cudnn7/Dockerfile b/docker/prebuilt/devel/gpu/9.2/cudnn7/Dockerfile deleted file mode 100644 index c8c00dc5a01..00000000000 --- a/docker/prebuilt/devel/gpu/9.2/cudnn7/Dockerfile +++ /dev/null @@ -1,72 +0,0 @@ -FROM espnet/espnet:runtime -LABEL maintainer "Nelson Yalta " - -## FROM CUDA 9.2 base [https://gitlab.com/nvidia/cuda/blob/ubuntu18.04/9.2/base/Dockerfile] -# CUDA 9.2 is not officially supported on ubuntu 18.04 yet, the ubuntu 17.10 repository for CUDA were used instead. 
-RUN apt-get update && apt-get install -y --no-install-recommends gnupg2 curl ca-certificates && \ - curl -fsSL https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1710/x86_64/7fa2af80.pub | apt-key add - && \ - echo "deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1710/x86_64 /" > /etc/apt/sources.list.d/cuda.list && \ - echo "deb https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1604/x86_64 /" > /etc/apt/sources.list.d/nvidia-ml.list && \ - apt-get purge --autoremove -y curl && \ - rm -rf /var/lib/apt/lists/* - -ENV CUDA_VERSION 9.2.148 - -ENV CUDA_PKG_VERSION 9-2=$CUDA_VERSION-1 -RUN apt-get update && apt-get install -y --no-install-recommends \ - cuda-cudart-$CUDA_PKG_VERSION && \ - ln -s cuda-9.2 /usr/local/cuda && \ - rm -rf /var/lib/apt/lists/* - -# nvidia-docker 1.0 -LABEL com.nvidia.volumes.needed="nvidia_driver" -LABEL com.nvidia.cuda.version="${CUDA_VERSION}" - -RUN echo "/usr/local/nvidia/lib" >> /etc/ld.so.conf.d/nvidia.conf && \ - echo "/usr/local/nvidia/lib64" >> /etc/ld.so.conf.d/nvidia.conf - -ENV PATH /usr/local/nvidia/bin:/usr/local/cuda/bin:${PATH} -ENV LD_LIBRARY_PATH /usr/local/nvidia/lib:/usr/local/nvidia/lib64 - -# nvidia-container-runtime -ENV NVIDIA_VISIBLE_DEVICES all -ENV NVIDIA_DRIVER_CAPABILITIES compute,utility -ENV NVIDIA_REQUIRE_CUDA "cuda>=9.2" - -ENV CUDA_HOME /usr/local/cuda - -## FROM CUDA 9.2 runtime - -ENV NCCL_VERSION 2.3.7 - -RUN apt-get update && apt-get install -y --no-install-recommends \ - cuda-libraries-$CUDA_PKG_VERSION \ - cuda-nvtx-$CUDA_PKG_VERSION \ - libnccl2=$NCCL_VERSION-1+cuda9.2 && \ - apt-mark hold libnccl2 && \ - rm -rf /var/lib/apt/lists/* - -## FROM CUDA 9.2 devel [https://gitlab.com/nvidia/cuda/blob/ubuntu18.04/9.2/devel/Dockerfile] - -RUN apt-get update && apt-get install -y --no-install-recommends \ - cuda-libraries-dev-$CUDA_PKG_VERSION \ - cuda-nvml-dev-$CUDA_PKG_VERSION \ - cuda-minimal-build-$CUDA_PKG_VERSION \ - cuda-command-line-tools-$CUDA_PKG_VERSION \ - libnccl-dev=$NCCL_VERSION-1+cuda9.2 && \ - rm -rf /var/lib/apt/lists/* - -ENV LIBRARY_PATH /usr/local/cuda/lib64/stubs - -## FROM CUDA 9.2-CUDNN 7 devel [https://gitlab.com/nvidia/cuda/blob/ubuntu18.04/9.2/devel/cudnn7/Dockerfile] - -ENV CUDNN_VERSION 7.5.0.56 -LABEL com.nvidia.cudnn.version="${CUDNN_VERSION}" - -RUN apt-get update && apt-get install -y --no-install-recommends \ - libcudnn7=$CUDNN_VERSION-1+cuda9.2 \ - libcudnn7-dev=$CUDNN_VERSION-1+cuda9.2 && \ - apt-mark hold libcudnn7 && \ - rm -rf /var/lib/apt/lists/* - -WORKDIR / \ No newline at end of file diff --git a/docker/prebuilt/gpu.dockerfile b/docker/prebuilt/gpu.dockerfile new file mode 100644 index 00000000000..a94504dc52c --- /dev/null +++ b/docker/prebuilt/gpu.dockerfile @@ -0,0 +1,60 @@ +ARG FROM_TAG +ARG NUM_BUILD_CORES=8 +ARG DOCKER_VER +FROM espnet/espnet:${FROM_TAG} AS cuda_builder +LABEL maintainer "Nelson Yalta " + +## FROM CUDA 11.1 base +## [https://gitlab.com/nvidia/container-images/cuda/-/blob/master/dist/11.1.1/ubuntu20.04-x86_64/base/Dockerfile] +RUN apt-get update && apt-get install -y --no-install-recommends \ + gnupg2 curl ca-certificates && \ + curl -fsSL https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/7fa2af80.pub | apt-key add - && \ + echo "deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64 /" > /etc/apt/sources.list.d/cuda.list && \ + echo "deb https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu2004/x86_64 /" > /etc/apt/sources.list.d/nvidia-ml.list 
&& \ + apt-get purge --autoremove -y curl \ + && rm -rf /var/lib/apt/lists/* + +ENV CUDA_VERSION 11.1.1 + +# For libraries in the cuda-compat-* package: https://docs.nvidia.com/cuda/eula/index.html#attachment-a +RUN apt-get update && apt-get install -y --no-install-recommends \ + cuda-cudart-11-1=11.1.74-1 \ + cuda-compat-11-1 \ + && ln -s cuda-11.1 /usr/local/cuda && \ + rm -rf /var/lib/apt/lists/* + +# Required for nvidia-docker v1 +RUN echo "/usr/local/nvidia/lib" >> /etc/ld.so.conf.d/nvidia.conf \ + && echo "/usr/local/nvidia/lib64" >> /etc/ld.so.conf.d/nvidia.conf + +ENV PATH /usr/local/nvidia/bin:/usr/local/cuda/bin:${PATH} +ENV LD_LIBRARY_PATH /usr/local/nvidia/lib:/usr/local/nvidia/lib64 + +# nvidia-container-runtime +ENV NVIDIA_VISIBLE_DEVICES all +ENV NVIDIA_DRIVER_CAPABILITIES compute,utility +ENV NVIDIA_REQUIRE_CUDA "cuda>=11.1 brand=tesla,driver>=418,driver<419 brand=tesla,driver>=440,driver<441 driver>=450" + +ENV CUDA_HOME /usr/local/cuda + +## FROM CUDA 11.1 devel +## [https://gitlab.com/nvidia/container-images/cuda/-/blob/master/dist/11.1.1/ubuntu20.04-x86_64/devel/Dockerfile] +ENV NCCL_VERSION 2.8.4 + +RUN apt-get update && apt-get install -y --no-install-recommends \ + libtinfo5 libncursesw5 \ + cuda-cudart-dev-11-1=11.1.74-1 \ + cuda-command-line-tools-11-1=11.1.1-1 \ + cuda-minimal-build-11-1=11.1.1-1 \ + cuda-libraries-dev-11-1=11.1.1-1 \ + cuda-nvml-dev-11-1=11.1.74-1 \ + libnpp-dev-11-1=11.1.2.301-1 \ + libcublas-dev-11-1=11.3.0.106-1 \ + libcusparse-dev-11-1=11.3.0.10-1 \ + && rm -rf /var/lib/apt/lists/* + +# Keep apt from auto upgrading the cublas package. See https://gitlab.com/nvidia/container-images/cuda/-/issues/88 +RUN apt-mark hold libcublas-dev-11-1 +ENV LIBRARY_PATH /usr/local/cuda/lib64/stubs + +WORKDIR / diff --git a/docker/prebuilt/local/Dockerfile b/docker/prebuilt/local/Dockerfile deleted file mode 100644 index 15939185aff..00000000000 --- a/docker/prebuilt/local/Dockerfile +++ /dev/null @@ -1,50 +0,0 @@ -ARG FROM_TAG -FROM espnet/espnet:${FROM_TAG} -LABEL maintainer "Nelson Yalta " - -ARG CUDA_VER -WORKDIR / - -# IF using a local ESPNet repository, a temporary file containing the ESPnet git repo is copied over -ARG ESPNET_ARCHIVE=./espnet-local.tar -COPY ${ESPNET_ARCHIVE} /espnet-local.tar - - -# Download ESPnet -RUN echo "Getting ESPnet sources from local repository, in temporary file: " ${ESPNET_ARCHIVE} -RUN mkdir /espnet -RUN tar xf espnet-local.tar -C /espnet/ -RUN rm espnet-local.tar - -RUN cd espnet && \ - rm -rf docker egs test utils - -# Install espnet -WORKDIR /espnet/tools - -# Replace nvidia-smi for nvcc because docker does not load nvidia-smi -RUN if [ -z "$( which nvcc )" ]; then \ - echo "Build without CUDA" && \ - MY_OPTS='CUPY_VERSION="" TH_VERSION=1.6.0'; \ - else \ - echo "Build with CUDA" && \ - # Disable cupy test - # Docker build does not load libcuda.so.1 - # So, their checks on cuda packages are disabled. - sed -i '200s|install.py|install.py --no-cuda --no-cupy |' Makefile && \ - export CFLAGS="-I${CUDA_HOME}/include ${CFLAGS}" && \ - MY_OPTS="CUDA_VERSION=${CUDA_VER}" && \ .
./setup_cuda_env.sh /usr/local/cuda; \ - fi; \ - if [ "${CUDA_VER}" = "10.1" ]; then \ - # warpctc is not supported from Pytorch 1.3.1 - MY_OPTS="${MY_OPTS} TH_VERSION=1.6.0"; \ - fi; \ - echo "Make with options ${MY_OPTS}" && \ - ln -s /kaldi ./ && \ - ./setup_anaconda.sh /miniconda espnet 3.7.4 && \ - make KALDI=/kaldi ${MY_OPTS} - -RUN rm -rf ../espnet - -WORKDIR / diff --git a/docker/prebuilt/runtime.dockerfile b/docker/prebuilt/runtime.dockerfile new file mode 100644 index 00000000000..5f54ed90c90 --- /dev/null +++ b/docker/prebuilt/runtime.dockerfile @@ -0,0 +1,72 @@ +ARG FROM_TAG +ARG NUM_BUILD_CORES=8 +ARG DOCKER_VER + +FROM ubuntu:${FROM_TAG} AS main_builder +LABEL maintainer "Nelson Yalta " + +ENV DOCKER_BUILT_VER ${DOCKER_VER} +ENV NUM_BUILD_CORES ${NUM_BUILD_CORES} + +RUN apt-get update && DEBIAN_FRONTEND=noninteractive \ + apt-get -y install --no-install-recommends \ + automake \ + autoconf \ + apt-utils \ + bc \ + build-essential \ + ca-certificates \ + cmake \ + curl \ + flac \ + ffmpeg \ + gawk \ + gfortran \ + git \ + libboost-all-dev \ + libtool \ + libbz2-dev \ + liblzma-dev \ + libsndfile1-dev \ + patch \ + python2.7 \ + python3 \ + software-properties-common \ + sox \ + subversion \ + unzip \ + wget \ + zip \ + zlib1g-dev \ + && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* + +# Latest version of git +RUN add-apt-repository ppa:git-core/ppa -y && \ + apt update && \ + apt install -y --no-install-recommends git-all && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* + +RUN git clone --depth 1 https://github.com/kaldi-asr/kaldi /opt/kaldi + +RUN wget --tries=3 -nv "https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh" -O miniconda.sh && \ + bash miniconda.sh -b -p /opt/miniconda && \ + rm miniconda.sh + +WORKDIR / + +FROM main_builder AS espnet1 +# # Using kaldi pre-built binaries +RUN cd /opt/kaldi/tools && \ + echo "" > extras/check_dependencies.sh && \ + chmod +x extras/check_dependencies.sh && \ + cd /opt/kaldi && \ + wget --tries=3 -nv https://github.com/espnet/kaldi-bin/releases/download/v0.0.1/ubuntu16-featbin.tar.gz && \ + tar -xf ./ubuntu16-featbin.tar.gz && \ + cp featbin/* src/featbin/ && \ + rm -rf featbin && \ + rm -f ubuntu16-featbin.tar.gz + +WORKDIR / diff --git a/docker/prebuilt/runtime/Dockerfile b/docker/prebuilt/runtime/Dockerfile deleted file mode 100644 index 1e1bc66a12c..00000000000 --- a/docker/prebuilt/runtime/Dockerfile +++ /dev/null @@ -1,60 +0,0 @@ -FROM ubuntu:18.04 -LABEL maintainer "Nelson Yalta " - -ARG DOCKER_VER -ENV DOCKER_BUILT_VER ${DOCKER_VER}} - -ARG NUM_BUILD_CORES=8 -ENV NUM_BUILD_CORES ${NUM_BUILD_CORES} - -RUN DEBIAN_FRONTEND=noninteractive apt-get update && apt-get -y install --no-install-recommends \ - automake \ - autoconf \ - apt-utils \ - bc \ - build-essential \ - ca-certificates \ - cmake \ - curl \ - flac \ - ffmpeg \ - gawk \ - gfortran \ - git \ - libtool \ - libsndfile1-dev \ - python2.7 \ - python3 \ - sox \ - subversion \ - unzip \ - wget \ - zip \ - zlib1g-dev \ - && \ - apt-get clean && \ - rm -rf /var/lib/apt/lists/* - -# Install Kaldi -RUN git clone https://github.com/kaldi-asr/kaldi - -RUN cd /kaldi/tools && \ - ./extras/install_mkl.sh -sp debian intel-mkl-64bit-2019.2-057 && \ - apt-get clean && \ - rm -rf /var/lib/apt/lists/* && \ - make all && \ - rm -r openfst-*/src && \ - ./extras/install_beamformit.sh && \ - ./extras/install_irstlm.sh && \ - cd /kaldi/src && \ - ./configure --shared --use-cuda=no && \ - make depend -j${NUM_BUILD_CORES} && \ - make -j${NUM_BUILD_CORES} && \ - 
find /kaldi/src -name "*.o" -exec rm -f {} \; && \ - find /kaldi/src -name "*.o" -exec rm -f {} \; - -RUN wget --tries=3 "https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh" -O miniconda.sh && \ - bash miniconda.sh -b -p /miniconda && \ - rm miniconda.sh - -WORKDIR / \ No newline at end of file diff --git a/docker/run.sh b/docker/run.sh index 4f5500a41b9..cff0d5604bc 100755 --- a/docker/run.sh +++ b/docker/run.sh @@ -3,11 +3,10 @@ docker_gpu=0 docker_egs= docker_folders= -docker_cuda=10.1 +docker_tag=latest docker_env= docker_cmd= -docker_os=u18 is_root=false is_local=false @@ -69,25 +68,19 @@ fi from_tag="cpu" if [ ! "${docker_gpu}" == "-1" ]; then - if [ -z "${docker_cuda}" ]; then - # If the docker_cuda is not set, the program will automatically - # search the installed version with default configurations (apt) - docker_cuda=$( nvcc -V | grep release ) - docker_cuda=${docker_cuda#*"release "} - docker_cuda=${docker_cuda%,*} - fi + docker_cuda=$( nvcc -V | grep release ) + docker_cuda=${docker_cuda#*"release "} + docker_cuda=${docker_cuda%,*} + # After searching for your CUDA version: if the variable docker_cuda is empty, the program will raise an error if [ -z "${docker_cuda}" ]; then - echo "CUDA was not found in your system. Use CPU image or install NVIDIA-DOCKER, CUDA and NVCC for GPU image." + echo "CUDA was not found in your system. Use the CPU image, or install NVIDIA Docker and CUDA for the GPU image." exit 1 - else - from_tag="gpu-cuda${docker_cuda}-cudnn7" fi + from_tag="gpu" fi -if [ ! -z "${docker_os}" ]; then - from_tag="${from_tag}-${docker_os}" -fi +from_tag="${from_tag}-${docker_tag}" EXTRAS=${is_extras} @@ -123,8 +116,8 @@ if [ ${is_root} = false ]; then build_args="${build_args} --build-arg THIS_UID=${UID}" build_args="${build_args} --build-arg EXTRA_LIBS=${EXTRAS}" - echo "Now running docker build ${build_args} -f prebuilt/Dockerfile -t espnet/espnet:${container_tag} ." - (docker build ${build_args} -f prebuilt/Dockerfile -t espnet/espnet:${container_tag} .) || exit 1 + echo "Now running docker build ${build_args} -f espnet.dockerfile -t espnet/espnet:${container_tag} ." + (docker build ${build_args} -f espnet.dockerfile -t espnet/espnet:${container_tag} .)
|| exit 1 fi else container_tag=${from_tag} diff --git a/egs/README.md b/egs/README.md index 2ea843d193b..61951b84d47 100755 --- a/egs/README.md +++ b/egs/README.md @@ -13,7 +13,7 @@ See: https://espnet.github.io/espnet/tutorial.html | aishell2 | AISHELL-2 Open Source Mandarin Speech Corpus | ASR | ZH | http://www.aishelltech.com/aishell_2 | | ami | The AMI Meeting Corpus | ASR | EN | http://groups.inf.ed.ac.uk/ami/corpus/ | | | an4 | CMU AN4 database | ASR/TTS | EN | http://www.speech.cs.cmu.edu/databases/an4/ | | -| arctic | CMU ARCTIC databases | TTS, VC | EN, EN -> EN | http://www.festvox.org/cmu_arctic/ | | +| arctic | CMU ARCTIC databases | TTS, VC | EN, EN -> EN | http://www.festvox.org/cmu_arctic/ | | | aurora4 | Aurora-4 database | ASR | EN | http://aurora.hsnr.de/aurora-4.html | | | babel | IARPA Babel corups | ASR | ~20 Languages | https://www.iarpa.gov/index.php/research-programs/babel | | | blizzard_2017 | Blizzard Challenge 2017 | TTS | EN | https://www.synsig.org/index.php/Blizzard_Challenge_2017 | | @@ -22,6 +22,7 @@ See: https://espnet.github.io/espnet/tutorial.html | chime6 | The 6th CHiME Speech Separation and Recognition Challenge | ASR | EN | https://chimechallenge.github.io/chime6/ | | | cmu_wilderness | CMU Wilderness Multilingual Speech Dataset | Multilingual ASR | ~100 Languages | https://github.com/festvox/datasets-CMU_Wilderness | | | commonvoice | The Mozilla Common Voice | ASR | 13 Languages | https://voice.mozilla.org/datasets | | +| covost2 | CoVoST: A Large-Scale Multilingual Speech-To-Text Translation Corpus | ASR/Machine Translation/Speech Translation | 15+21 Language pairs | https://github.com/facebookresearch/covost | | | csj | Corpus of Spontaneous Japanese | ASR | JP | https://pj.ninjal.ac.jp/corpus_center/csj/en/ | | | csmsc | Chinese Standard Mandarin Speech Copus | TTS | ZH | https://www.data-baker.com/open_source.html | | | dipco | Dinner Party Corpus | ASR | EN | https://arxiv.org/abs/1909.13447 | | @@ -33,6 +34,9 @@ See: https://espnet.github.io/espnet/tutorial.html | hub4_spanish | 1997 Spanish Broadcast News Speech (HUB4-NE) | ASR | ES | https://catalog.ldc.upenn.edu/LDC98S74, https://catalog.ldc.upenn.edu/LDC98T29 | | | iwslt16 | International Workshop on Spoken Language Translation 2016 | Machine Translation | EN->DE | https://wit3.fbk.eu/mt.php?release=2016-01 | | | iwslt18 | International Workshop on Spoken Language Translation 2018 | ASR/Machine Translation/Speech Translation | EN->DE | https://sites.google.com/site/iwsltevaluation2018/Lectures-task | | +| iwslt19 | International Workshop on Spoken Language Translation 2019 | ASR/Speech Translation | EN->DE | https://sites.google.com/view/iwslt-evaluation-2019/speech-translation | +| iwslt21 | International Workshop on Spoken Language Translation 2021 | ASR/Machine Translation/Speech Translation | EN->DE | https://iwslt.org/2021/offline | +| iwslt21_low_resource | International Workshop on Spoken Language Translation 2021 | ASR/Speech Translation | SWA->EN & SWC->FR | https://iwslt.org/2021/low-resource | | jesc | Japanese-English Subtitle Corpus | Machine Translation | EN->JP | https://nlp.stanford.edu/projects/jesc/ | | | jnas | ASJ Japanese Newspaper Article Sentences Read Speech Corpus (JNAS) | ASR/TTS | JP | http://research.nii.ac.jp/src/JNAS.html | | | jsalt18e2e | Multilingual End-to-end ASR for Incomplete Data Benchmark | Multilingual ASR | ~20 Languages | https://www.clsp.jhu.edu/workshops/18-workshop/multilingual-end-end-asr-incomplete-data/ | babel+ | @@ -45,7 +49,10 @@ See: 
https://espnet.github.io/espnet/tutorial.html | librispeech | LibriSpeech ASR corpus | ASR | EN | http://www.openslr.org/12 | | | libritts | LibriTTS: A Corpus Derived from LibriSpeech for Text-to-Speech | TTS | EN | http://www.openslr.org/60/ | | | ljspeech | The LJ Speech Dataset | TTS | EN | https://keithito.com/LJ-Speech-Dataset/ | | +| lrs | The Lip Reading Sentences Dataset | ASR/AVSR | EN | https://www.robots.ox.ac.uk/~vgg/data/lip_reading/lrs2.html | | | m_ailabs | The M-AILABS Speech Dataset | TTS | ~5 languages | https://www.caito.de/2019/01/the-m-ailabs-speech-dataset/ | +| mucs_2021 | MUCS 2021: MUltilingual and Code-Switching ASR Challenges for Low Resource Indian Languages | ASR/Code Switching | HI, MR, OR, TA, TE, GU, HI-EN, BN-EN | https://navana-tech.github.io/MUCS2021/data.html | | +| mtedx | Multilingual TEDx | ASR/Machine Translation/Speech Translation | 13 Language pairs | http://www.openslr.org/100/ | | must_c | Must-C Multilingual Speech Translation Corpus | ASR/Machine Translation/Speech Translation | EN->{DE, ES, FR, IT, NL, PT, RO, RU} | https://ict.fbk.eu/must-c/ | | | | must_c_v2 | Must-C Multilingual Speech Translation Corpus | ASR/Machine Translation/Speech Translation | EN->DE | https://ict.fbk.eu/must-c/ https://iwslt.org/2021/offline | More talks that result in 20k more audio/text segments. Improved cleaning strategies able to better discard low-quality triplets. TED talks of MuST-C v2 were downloaded from the YouTube TED channel. | | puebla_nahuatl | The Puebla-Nahuatl Corpus | ASR | Nahuatl | http://www.openslr.org/89 | | diff --git a/egs/aidatatang_200zh/asr1/cmd.sh b/egs/aidatatang_200zh/asr1/cmd.sh index 4d70c9c7a79..7b70ef5e06e 100644 --- a/egs/aidatatang_200zh/asr1/cmd.sh +++ b/egs/aidatatang_200zh/asr1/cmd.sh @@ -22,7 +22,7 @@ # If jobs failed, your configuration might be wrong for your environment. # # -# The official documentaion for run.pl, queue.pl, slurm.pl, and ssh.pl: +# The official documentation for run.pl, queue.pl, slurm.pl, and ssh.pl: # "Parallelization in Kaldi": http://kaldi-asr.org/doc/queue.html # =========================================================~ @@ -56,7 +56,7 @@ elif [ "${cmd_backend}" = slurm ]; then # The default setting is written in conf/slurm.conf. # You must change "-p cpu" and "-p gpu" for the "partion" for your environment. # To know the "partion" names, type "sinfo". - # You can use "--gpu * " by defualt for slurm and it is interpreted as "--gres gpu:*" + # You can use "--gpu * " by default for slurm and it is interpreted as "--gres gpu:*" # The devices are allocated exclusively using "${CUDA_VISIBLE_DEVICES}". 
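(Editorial aside: the many identical cmd.sh hunks in this patch all document the same Kaldi-style launcher interface. A minimal sketch of how the exported wrapper is consumed, reusing the array-job example from the usage header that this patch adds in egs/cmu_indic/tts1/cmd.sh:)

```bash
# Source the recipe's cmd.sh so ${train_cmd} points at the chosen backend
# (run.pl locally, queue.pl or slurm.pl on a cluster).
. ./cmd.sh
# Kaldi-style array job: 10 shards, 4 GB of memory each, one log per shard.
${train_cmd} --mem 4G JOB=1:10 echo.JOB.log echo JOB
# Under slurm.pl, adding "--gpu 1" is interpreted as "--gres gpu:1".
```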
export train_cmd="slurm.pl" diff --git a/egs/aishell/asr1/RESULTS.md b/egs/aishell/asr1/RESULTS.md index 158645ba99d..6221ac11328 100644 --- a/egs/aishell/asr1/RESULTS.md +++ b/egs/aishell/asr1/RESULTS.md @@ -1,3 +1,82 @@ +# Conformer-Transducer with auxiliary task (CTC weight = 0.5) + +## Environments +- Same as RNN-Transducer (see below) + +## Config files +- preprocess config: `conf/specaug.yaml` +- train config: `conf/tuning/transducer/train_conformer-rnn_transducer_aux_ngpu4.yaml` +- lm config: `-` (LM was not used) +- decode config: `conf/tuning/transducer/decode_default.yaml` +- ngpu: `4` + +## Results (CER) +|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| +|---|---|---|---|---|---|---|---|---| +|decode_dev_decode_default|14326|205341|95.8|4.0|0.2|0.1|4.3|33.6| +|decode_test_decode_default|7176|104765|95.3|4.4|0.2|0.1|4.8|36.3| + + +# Conformer-Transducer + +## Environments +- Same as RNN-Transducer (see below) + +## Config files +- preprocess config: `conf/specaug.yaml` +- train config: `conf/tuning/transducer/train_conformer-rnn_transducer.yaml` +- lm config: `-` (LM was not used) +- decode config: `conf/tuning/transducer/decode_default.yaml` + +## Results (CER) +|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| +|---|---|---|---|---|---|---|---|---| +|decode_dev_decode_default|14326|205341|95.6|4.2|0.2|0.1|4.5|34.0| +|decode_test_decode_default|7176|104765|95.0|4.7|0.3|0.1|5.0|37.1| + + +# RNN-Transducer with auxiliary task (CTC weight = 0.1) + +## Environments +- Same as RNN-Transducer (see below) + +## Config files +- preprocess config: `conf/specaug.yaml` +- train config: `conf/tuning/transducer/train_transducer_aux.yaml` +- lm config: `-` (LM was not used) +- decode config: `conf/tuning/transducer/decode_default.yaml` + +## Results (CER) +|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| +|---|---|---|---|---|---|---|---|---| +|decode_dev_decode_default|14326|205341|93.9|5.8|0.3|0.1|6.3|41.9| +|decode_test_decode_default|7176|104765|93.2|6.5|0.4|0.1|6.9|44.5| + + +# RNN-Transducer + +## Environments +- date: `Thu May 20 05:29:03 UTC 2021` +- python version: `3.7.4 (default, Aug 13 2019, 20:35:49) [GCC 7.3.0]` +- espnet version: `espnet 0.9.8` +- chainer version: `chainer 6.0.0` +- pytorch version: `pytorch 1.6.0` +- Git hash: `95b3008cdcc2247e781a048bc999243dc7f45fe7` + - Commit date: `Sat Mar 6 00:48:29 2021 +0000` + +## Config files +- preprocess config: `conf/specaug.yaml` +- train config: `conf/tuning/transducer/train_transducer.yaml` +- lm config: `-` (LM was not used) +- decode config: `conf/tuning/transducer/decode_default.yaml` + +## Results (CER) +|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| +|---|---|---|---|---|---|---|---|---| +|decode_dev_decode_default|14326|205341|93.8|5.9|0.3|0.1|6.3|42.0| +|decode_test_decode_default|7176|104765|92.9|6.7|0.3|0.1|7.2|45.9| + + # Conformer (kernel size = 15) + SpecAugment + LM weight = 0.0 result - training config file: `conf/tuning/train_pytorch_conformer_kernel15.yaml` diff --git a/egs/aishell/asr1/cmd.sh b/egs/aishell/asr1/cmd.sh index 4d70c9c7a79..7b70ef5e06e 100644 --- a/egs/aishell/asr1/cmd.sh +++ b/egs/aishell/asr1/cmd.sh @@ -22,7 +22,7 @@ # If jobs failed, your configuration might be wrong for your environment. 
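(Editorial aside: the transducer results added to RESULTS.md above are produced with the conf/tuning/transducer/*.yaml files introduced later in this patch. A hypothetical invocation, assuming the standard egs/*/asr1 run.sh convention where --train-config and --decode-config map onto the recipe's train_config/decode_config variables via parse_options.sh:)

```bash
cd egs/aishell/asr1
# RESULTS.md states "LM was not used" for these models, so no LM config is passed.
./run.sh --ngpu 1 \
    --preprocess-config conf/specaug.yaml \
    --train-config conf/tuning/transducer/train_transducer.yaml \
    --decode-config conf/tuning/transducer/decode_default.yaml
```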
# # -# The official documentaion for run.pl, queue.pl, slurm.pl, and ssh.pl: +# The official documentation for run.pl, queue.pl, slurm.pl, and ssh.pl: # "Parallelization in Kaldi": http://kaldi-asr.org/doc/queue.html # =========================================================~ @@ -56,7 +56,7 @@ elif [ "${cmd_backend}" = slurm ]; then # The default setting is written in conf/slurm.conf. # You must change "-p cpu" and "-p gpu" for the "partion" for your environment. # To know the "partion" names, type "sinfo". - # You can use "--gpu * " by defualt for slurm and it is interpreted as "--gres gpu:*" + # You can use "--gpu * " by default for slurm and it is interpreted as "--gres gpu:*" # The devices are allocated exclusively using "${CUDA_VISIBLE_DEVICES}". export train_cmd="slurm.pl" diff --git a/egs/aishell/asr1/conf/tuning/train_pytorch_conformer_kernel15.yaml b/egs/aishell/asr1/conf/tuning/train_pytorch_conformer_kernel15.yaml index 7d5effcca04..5a51f93ffda 100644 --- a/egs/aishell/asr1/conf/tuning/train_pytorch_conformer_kernel15.yaml +++ b/egs/aishell/asr1/conf/tuning/train_pytorch_conformer_kernel15.yaml @@ -43,7 +43,7 @@ transformer-init: pytorch transformer-encoder-pos-enc-layer-type: rel_pos transformer-encoder-selfattn-layer-type: rel_selfattn transformer-encoder-activation-type: swish -rel_pos_type: latest +rel-pos-type: latest macaron-style: true use-cnn-module: true cnn-module-kernel: 15 diff --git a/egs/aishell/asr1/conf/tuning/transducer/decode_default.yaml b/egs/aishell/asr1/conf/tuning/transducer/decode_default.yaml new file mode 100644 index 00000000000..b62b87b7f73 --- /dev/null +++ b/egs/aishell/asr1/conf/tuning/transducer/decode_default.yaml @@ -0,0 +1,5 @@ +# decoding parameters +batch: 0 +beam-size: 10 +search-type: default +score-norm: True diff --git a/egs/aishell/asr1/conf/tuning/transducer/train_conformer-rnn_transducer.yaml b/egs/aishell/asr1/conf/tuning/transducer/train_conformer-rnn_transducer.yaml new file mode 100644 index 00000000000..cfb84ec9732 --- /dev/null +++ b/egs/aishell/asr1/conf/tuning/transducer/train_conformer-rnn_transducer.yaml @@ -0,0 +1,52 @@ +# minibatch related +batch-size: 64 +maxlen-in: 512 +maxlen-out: 150 + +# optimization related +criterion: loss +early-stop-criterion: "validation/main/loss" +sortagrad: 0 +opt: noam +transformer-lr: 1.0 +transformer-warmup-steps: 25000 +epochs: 100 +patience: 0 +accum-grad: 2 +grad-clip: 5.0 + +# network architecture +## general +custom-enc-positional-encoding-type: rel_pos +custom-enc-self-attn-type: rel_self_attn +custom-enc-pw-activation-type: swish +## encoder related +etype: custom +custom-enc-input-layer: vgg2l +enc-block-arch: + - type: conformer + d_hidden: 512 + d_ff: 2048 + heads: 4 + macaron_style: True + use_conv_mod: True + conv_mod_kernel: 15 + dropout-rate: 0.3 + att-dropout-rate: 0.3 +enc-block-repeat: 12 +## decoder related +dtype: lstm +dlayers: 1 +dec-embed-dim: 1024 +dunits: 512 +dropout-rate-embed-decoder: 0.2 +dropout-rate-decoder: 0.1 +## joint network related +joint-dim: 512 + +# transducer related +model-module: "espnet.nets.pytorch_backend.e2e_asr_transducer:E2E" + +# reporter related +report-wer: True +report-cer: True diff --git a/egs/aishell/asr1/conf/tuning/transducer/train_conformer-rnn_transducer_aux_ngpu4.yaml b/egs/aishell/asr1/conf/tuning/transducer/train_conformer-rnn_transducer_aux_ngpu4.yaml new file mode 100644 index 00000000000..28c37402b4b --- /dev/null +++ b/egs/aishell/asr1/conf/tuning/transducer/train_conformer-rnn_transducer_aux_ngpu4.yaml @@ -0,0 +1,57 @@ +# 
minibatch related +batch-size: 32 +maxlen-in: 512 +maxlen-out: 150 + +# optimization related +criterion: loss +early-stop-criterion: "validation/main/loss" +sortagrad: 0 +opt: noam +transformer-lr: 1.0 +transformer-warmup-steps: 25000 +epochs: 100 +patience: 0 +#accum-grad: 2 +grad-clip: 5.0 + +# network architecture +## general +custom-enc-positional-encoding-type: rel_pos +custom-enc-self-attn-type: rel_self_attn +custom-enc-pw-activation-type: swish +## encoder related +etype: custom +custom-enc-input-layer: vgg2l +enc-block-arch: + - type: conformer + d_hidden: 512 + d_ff: 2048 + heads: 4 + macaron_style: True + use_conv_mod: True + conv_mod_kernel: 15 + dropout-rate: 0.3 + att-dropout-rate: 0.3 +enc-block-repeat: 12 +## decoder related +dtype: lstm +dlayers: 1 +dec-embed-dim: 1024 +dunits: 512 +dropout-rate-embed-decoder: 0.2 +dropout-rate-decoder: 0.1 +## joint network related +joint-dim: 512 + +# transducer related +model-module: "espnet.nets.pytorch_backend.e2e_asr_transducer:E2E" + +# reporter related +report-wer: True +report-cer: True + +# auxiliary task +aux-ctc: True +aux-ctc-weight: 0.5 +aux-ctc-dropout-rate: 0.1 diff --git a/egs/aishell/asr1/conf/tuning/transducer/train_transducer.yaml b/egs/aishell/asr1/conf/tuning/transducer/train_transducer.yaml new file mode 100644 index 00000000000..c8be66354fc --- /dev/null +++ b/egs/aishell/asr1/conf/tuning/transducer/train_transducer.yaml @@ -0,0 +1,37 @@ +# minibatch related +batch-size: 64 +maxlen-in: 512 +maxlen-out: 150 + +# optimization related +criterion: loss +early-stop-criterion: "validation/main/loss" +sortagrad: 0 +opt: adadelta +epochs: 30 +patience: 3 +accum-grad: 2 + +# network architecture +## encoder related +etype: vggblstm +elayers: 6 +eunits: 512 +eprojs: 512 +dropout-rate: 0.4 +## decoder related +dtype: lstm +dlayers: 1 +dec-embed-dim: 1024 +dunits: 512 +dropout-rate-embed-decoder: 0.2 +dropout-rate-decoder: 0.1 +## joint network related +joint-dim: 512 + +# transducer related +model-module: "espnet.nets.pytorch_backend.e2e_asr_transducer:E2E" + +# reporter related +report-wer: True +report-cer: True diff --git a/egs/aishell/asr1/conf/tuning/transducer/train_transducer_aux.yaml b/egs/aishell/asr1/conf/tuning/transducer/train_transducer_aux.yaml new file mode 100644 index 00000000000..9c3fc715bc7 --- /dev/null +++ b/egs/aishell/asr1/conf/tuning/transducer/train_transducer_aux.yaml @@ -0,0 +1,42 @@ +# minibatch related +batch-size: 64 +maxlen-in: 512 +maxlen-out: 150 + +# optimization related +criterion: loss +early-stop-criterion: "validation/main/loss" +sortagrad: 0 +opt: adadelta +epochs: 30 +patience: 3 +accum-grad: 2 + +# network architecture +## encoder related +etype: vggblstm +elayers: 6 +eunits: 512 +eprojs: 512 +dropout-rate: 0.4 +## decoder related +dtype: lstm +dlayers: 1 +dec-embed-dim: 1024 +dunits: 512 +dropout-rate-embed-decoder: 0.2 +dropout-rate-decoder: 0.1 +## joint network related +joint-dim: 512 + +# transducer related +model-module: "espnet.nets.pytorch_backend.e2e_asr_transducer:E2E" + +# reporter related +report-wer: True +report-cer: True + +# auxiliary task +aux-ctc: True +aux-ctc-weight: 0.1 +aux-ctc-dropout-rate: 0.1 diff --git a/egs/aishell/asr1/run.sh b/egs/aishell/asr1/run.sh index 3f92e8bac00..a19805ee0cd 100755 --- a/egs/aishell/asr1/run.sh +++ b/egs/aishell/asr1/run.sh @@ -241,6 +241,15 @@ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then --out ${expdir}/results/${recog_model} \ --num ${n_average} fi + + if [[ $(get_yaml.py ${train_config} model-module) = *transducer* ]]; 
then + echo "[info]: transducer model does not support '--api v2'" \ + "(hence ngram is ignored)" + recog_v2_opts="" + else + recog_v2_opts="--ngram-model ${ngramexpdir}/${n_gram}gram.bin --api v2" + fi + pids=() # initialize pids for rtask in ${recog_set}; do ( @@ -263,8 +272,7 @@ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then --result-label ${expdir}/${decode_dir}/data.JOB.json \ --model ${expdir}/results/${recog_model} \ --rnnlm ${lmexpdir}/rnnlm.model.best \ - --ngram-model ${ngramexpdir}/${n_gram}gram.bin \ - --api v2 + ${recog_v2_opts} score_sclite.sh ${expdir}/${decode_dir} ${dict} diff --git a/egs/aishell2/asr1/cmd.sh b/egs/aishell2/asr1/cmd.sh index 4d70c9c7a79..7b70ef5e06e 100755 --- a/egs/aishell2/asr1/cmd.sh +++ b/egs/aishell2/asr1/cmd.sh @@ -22,7 +22,7 @@ # If jobs failed, your configuration might be wrong for your environment. # # -# The official documentaion for run.pl, queue.pl, slurm.pl, and ssh.pl: +# The official documentation for run.pl, queue.pl, slurm.pl, and ssh.pl: # "Parallelization in Kaldi": http://kaldi-asr.org/doc/queue.html # =========================================================~ @@ -56,7 +56,7 @@ elif [ "${cmd_backend}" = slurm ]; then # The default setting is written in conf/slurm.conf. # You must change "-p cpu" and "-p gpu" for the "partion" for your environment. # To know the "partion" names, type "sinfo". - # You can use "--gpu * " by defualt for slurm and it is interpreted as "--gres gpu:*" + # You can use "--gpu * " by default for slurm and it is interpreted as "--gres gpu:*" # The devices are allocated exclusively using "${CUDA_VISIBLE_DEVICES}". export train_cmd="slurm.pl" diff --git a/egs/ami/asr1/cmd.sh b/egs/ami/asr1/cmd.sh index 4d70c9c7a79..7b70ef5e06e 100644 --- a/egs/ami/asr1/cmd.sh +++ b/egs/ami/asr1/cmd.sh @@ -22,7 +22,7 @@ # If jobs failed, your configuration might be wrong for your environment. # # -# The official documentaion for run.pl, queue.pl, slurm.pl, and ssh.pl: +# The official documentation for run.pl, queue.pl, slurm.pl, and ssh.pl: # "Parallelization in Kaldi": http://kaldi-asr.org/doc/queue.html # =========================================================~ @@ -56,7 +56,7 @@ elif [ "${cmd_backend}" = slurm ]; then # The default setting is written in conf/slurm.conf. # You must change "-p cpu" and "-p gpu" for the "partion" for your environment. # To know the "partion" names, type "sinfo". - # You can use "--gpu * " by defualt for slurm and it is interpreted as "--gres gpu:*" + # You can use "--gpu * " by default for slurm and it is interpreted as "--gres gpu:*" # The devices are allocated exclusively using "${CUDA_VISIBLE_DEVICES}". export train_cmd="slurm.pl" diff --git a/egs/ami/asr1/local/ami_download.sh b/egs/ami/asr1/local/ami_download.sh index bae72d1716a..0c19b24b9ad 100755 --- a/egs/ami/asr1/local/ami_download.sh +++ b/egs/ami/asr1/local/ami_download.sh @@ -103,7 +103,7 @@ else fi fi -echo "Downloads of AMI corpus completed succesfully. License can be found under $adir/LICENCE.TXT" +echo "Downloads of AMI corpus completed successfully. 
License can be found under $adir/LICENCE.TXT" exit 0; diff --git a/egs/ami/asr1/local/ami_ihm_scoring_data_prep.sh b/egs/ami/asr1/local/ami_ihm_scoring_data_prep.sh index f8420041362..a0b6470fb87 100755 --- a/egs/ami/asr1/local/ami_ihm_scoring_data_prep.sh +++ b/egs/ami/asr1/local/ami_ihm_scoring_data_prep.sh @@ -88,7 +88,7 @@ awk '{print $1}' $dir/segments | \ sort -k 2 $dir/utt2spk | utils/utt2spk_to_spk2utt.pl > $dir/spk2utt || exit 1; #check and correct the case when segment timings for given speaker overlap themself -#(important for simulatenous asclite scoring to proceed). +#(important for simultaneous asclite scoring to proceed). #There is actually only one such case for devset and automatic segmentetions join $dir/utt2spk $dir/segments | \ perl -ne '{BEGIN{$pu=""; $pt=0.0;} split; diff --git a/egs/ami/asr1/local/ami_mdm_scoring_data_prep.sh b/egs/ami/asr1/local/ami_mdm_scoring_data_prep.sh index 7d4d963f688..8dc96e52318 100755 --- a/egs/ami/asr1/local/ami_mdm_scoring_data_prep.sh +++ b/egs/ami/asr1/local/ami_mdm_scoring_data_prep.sh @@ -94,7 +94,7 @@ awk '{print $1}' $tmpdir/segments | \ print "$1$2$3 $1$2\n";' > $tmpdir/utt2spk_stm || exit 1; #check and correct case when segment timings for a given speaker overlap themself -#(important for simulatenous asclite scoring to proceed). +#(important for simultaneous asclite scoring to proceed). #There is actually only one such case for devset and automatic segmentetions join $tmpdir/utt2spk_stm $tmpdir/segments | \ awk '{ utt=$1; spk=$2; wav=$3; t_beg=$4; t_end=$5; @@ -122,7 +122,7 @@ for f in spk2utt utt2spk utt2spk_stm wav.scp text segments reco2file_and_channel done cp local/english.glm $dir/glm -#note, although utt2spk contains mappings to the whole meetings for simulatenous scoring +#note, although utt2spk contains mappings to the whole meetings for simultaneous scoring #we need to know which speakers overlap at meeting level, hence we generate an extra utt2spk_stm file local/convert2stm.pl $dir utt2spk_stm > $dir/stm diff --git a/egs/ami/asr1/local/ami_sdm_scoring_data_prep.sh b/egs/ami/asr1/local/ami_sdm_scoring_data_prep.sh index b0a656d1444..a2be3cd695a 100755 --- a/egs/ami/asr1/local/ami_sdm_scoring_data_prep.sh +++ b/egs/ami/asr1/local/ami_sdm_scoring_data_prep.sh @@ -106,7 +106,7 @@ awk '{print $1}' $tmpdir/segments | \ > $tmpdir/utt2spk_stm || exit 1; #check and correct the case when segment timings for given speaker overlap themself -#(important for simulatenous asclite scoring to proceed). +#(important for simultaneous asclite scoring to proceed). #There is actually only one such case for devset and automatic segmentetions join $tmpdir/utt2spk_stm $tmpdir/segments | \ awk '{ utt=$1; spk=$2; wav=$3; t_beg=$4; t_end=$5; diff --git a/egs/an4/asr1/cmd.sh b/egs/an4/asr1/cmd.sh index 4d70c9c7a79..7b70ef5e06e 100644 --- a/egs/an4/asr1/cmd.sh +++ b/egs/an4/asr1/cmd.sh @@ -22,7 +22,7 @@ # If jobs failed, your configuration might be wrong for your environment. # # -# The official documentaion for run.pl, queue.pl, slurm.pl, and ssh.pl: +# The official documentation for run.pl, queue.pl, slurm.pl, and ssh.pl: # "Parallelization in Kaldi": http://kaldi-asr.org/doc/queue.html # =========================================================~ @@ -56,7 +56,7 @@ elif [ "${cmd_backend}" = slurm ]; then # The default setting is written in conf/slurm.conf. # You must change "-p cpu" and "-p gpu" for the "partion" for your environment. # To know the "partion" names, type "sinfo". 
- # You can use "--gpu * " by defualt for slurm and it is interpreted as "--gres gpu:*" + # You can use "--gpu * " by default for slurm and it is interpreted as "--gres gpu:*" # The devices are allocated exclusively using "${CUDA_VISIBLE_DEVICES}". export train_cmd="slurm.pl" diff --git a/egs/an4/tts1/cmd.sh b/egs/an4/tts1/cmd.sh index 4d70c9c7a79..7b70ef5e06e 100644 --- a/egs/an4/tts1/cmd.sh +++ b/egs/an4/tts1/cmd.sh @@ -22,7 +22,7 @@ # If jobs failed, your configuration might be wrong for your environment. # # -# The official documentaion for run.pl, queue.pl, slurm.pl, and ssh.pl: +# The official documentation for run.pl, queue.pl, slurm.pl, and ssh.pl: # "Parallelization in Kaldi": http://kaldi-asr.org/doc/queue.html # =========================================================~ @@ -56,7 +56,7 @@ elif [ "${cmd_backend}" = slurm ]; then # The default setting is written in conf/slurm.conf. # You must change "-p cpu" and "-p gpu" for the "partion" for your environment. # To know the "partion" names, type "sinfo". - # You can use "--gpu * " by defualt for slurm and it is interpreted as "--gres gpu:*" + # You can use "--gpu * " by default for slurm and it is interpreted as "--gres gpu:*" # The devices are allocated exclusively using "${CUDA_VISIBLE_DEVICES}". export train_cmd="slurm.pl" diff --git a/egs/arctic/tts1/cmd.sh b/egs/arctic/tts1/cmd.sh index 4d70c9c7a79..7b70ef5e06e 100644 --- a/egs/arctic/tts1/cmd.sh +++ b/egs/arctic/tts1/cmd.sh @@ -22,7 +22,7 @@ # If jobs failed, your configuration might be wrong for your environment. # # -# The official documentaion for run.pl, queue.pl, slurm.pl, and ssh.pl: +# The official documentation for run.pl, queue.pl, slurm.pl, and ssh.pl: # "Parallelization in Kaldi": http://kaldi-asr.org/doc/queue.html # =========================================================~ @@ -56,7 +56,7 @@ elif [ "${cmd_backend}" = slurm ]; then # The default setting is written in conf/slurm.conf. # You must change "-p cpu" and "-p gpu" for the "partion" for your environment. # To know the "partion" names, type "sinfo". - # You can use "--gpu * " by defualt for slurm and it is interpreted as "--gres gpu:*" + # You can use "--gpu * " by default for slurm and it is interpreted as "--gres gpu:*" # The devices are allocated exclusively using "${CUDA_VISIBLE_DEVICES}". export train_cmd="slurm.pl" diff --git a/egs/arctic/tts1/conf/train_pytorch_tacotron2.v3.finetune.yaml b/egs/arctic/tts1/conf/train_pytorch_tacotron2.v3.finetune.yaml index 70f84e663b3..ee46dcaf412 100644 --- a/egs/arctic/tts1/conf/train_pytorch_tacotron2.v3.finetune.yaml +++ b/egs/arctic/tts1/conf/train_pytorch_tacotron2.v3.finetune.yaml @@ -1,5 +1,5 @@ # This configuration uses reduction factor = 1 and location-sensitive attention. -# Furthermore, to accelerate the learning of diaogonal attention, we additionaly +# Furthermore, to accelerate the learning of diagonal attention, we additionally # use guided attention loss. This leads super fast and robust attention learning.
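(Editorial note: the guided attention loss named in this config header follows Tachibana et al. (2017). A sketch of the penalty, assuming N encoder positions, T decoder frames, attention weights A, and a sharpness hyperparameter g:)

```latex
% Attention mass far from the diagonal is penalized; minimizing the loss
% pulls A toward a monotonic, near-diagonal alignment.
W_{n,t} = 1 - \exp\!\left(-\frac{(n/N - t/T)^2}{2g^2}\right), \qquad
\mathcal{L}_{\mathrm{ga}} = \frac{1}{NT}\sum_{n=1}^{N}\sum_{t=1}^{T} A_{n,t}\, W_{n,t}
```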
# encoder related diff --git a/egs/arctic/tts1/local/data_download.sh b/egs/arctic/tts1/local/data_download.sh index e7fb368be7f..18da617f74e 100755 --- a/egs/arctic/tts1/local/data_download.sh +++ b/egs/arctic/tts1/local/data_download.sh @@ -1,4 +1,5 @@ -#!/usr/bin/env bash -e +#!/usr/bin/env bash +set -e # Copyright 2019 Nagoya University (Tomoki Hayashi) # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) diff --git a/egs/arctic/tts1/local/data_prep.sh b/egs/arctic/tts1/local/data_prep.sh index d087c2f9f9d..3d23e19a720 100755 --- a/egs/arctic/tts1/local/data_prep.sh +++ b/egs/arctic/tts1/local/data_prep.sh @@ -1,4 +1,5 @@ -#!/usr/bin/env bash -e +#!/usr/bin/env bash +set -e # Copyright 2019 Nagoya University (Tomoki Hayashi) # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) diff --git a/egs/arctic/tts1/local/pretrained_model_download.sh b/egs/arctic/tts1/local/pretrained_model_download.sh index 89698164812..cd01a43faec 100755 --- a/egs/arctic/tts1/local/pretrained_model_download.sh +++ b/egs/arctic/tts1/local/pretrained_model_download.sh @@ -1,4 +1,5 @@ -#!/usr/bin/env bash -e +#!/usr/bin/env bash +set -e # Copyright 2019 Nagoya University (Tomoki Hayashi) # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) @@ -34,4 +35,4 @@ if [ ! -e ${dir}/.complete ]; then download_from_google_drive.sh ${share_url} ${dir} ".tar.gz" touch ${dir}/.complete fi -echo "Successfully finished donwload of pretrained model." +echo "Successfully finished download of pretrained model." diff --git a/egs/arctic/vc1/cmd.sh b/egs/arctic/vc1/cmd.sh index 4d70c9c7a79..7b70ef5e06e 100644 --- a/egs/arctic/vc1/cmd.sh +++ b/egs/arctic/vc1/cmd.sh @@ -22,7 +22,7 @@ # If jobs failed, your configuration might be wrong for your environment. # # -# The official documentaion for run.pl, queue.pl, slurm.pl, and ssh.pl: +# The official documentation for run.pl, queue.pl, slurm.pl, and ssh.pl: # "Parallelization in Kaldi": http://kaldi-asr.org/doc/queue.html # =========================================================~ @@ -56,7 +56,7 @@ elif [ "${cmd_backend}" = slurm ]; then # The default setting is written in conf/slurm.conf. # You must change "-p cpu" and "-p gpu" for the "partion" for your environment. # To know the "partion" names, type "sinfo". - # You can use "--gpu * " by defualt for slurm and it is interpreted as "--gres gpu:*" + # You can use "--gpu * " by default for slurm and it is interpreted as "--gres gpu:*" # The devices are allocated exclusively using "${CUDA_VISIBLE_DEVICES}". 
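(Editorial note on the recurring shebang change in the local/*.sh hunks above: `#!/usr/bin/env bash -e` is unportable because on Linux the kernel passes everything after the interpreter path to `env` as a single argument, so `env` searches for a program literally named "bash -e". Moving the flag into the script body, as these hunks do, behaves identically everywhere:)

```bash
#!/usr/bin/env bash
set -e  # abort on the first failing command, as "bash -e" intended
```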
export train_cmd="slurm.pl" diff --git a/egs/arctic/vc1/conf/tuning/train_pytorch_transformer.tts_pt.v1.1.single.yaml b/egs/arctic/vc1/conf/tuning/train_pytorch_transformer.tts_pt.v1.1.single.yaml index d1d266a3329..ca1b7a12938 100644 --- a/egs/arctic/vc1/conf/tuning/train_pytorch_transformer.tts_pt.v1.1.single.yaml +++ b/egs/arctic/vc1/conf/tuning/train_pytorch_transformer.tts_pt.v1.1.single.yaml @@ -22,7 +22,7 @@ use-masking: True bce-pos-weight: 5.0 use-batch-norm: True use-scaled-pos-enc: True -encoder-normalize-before: False +encoder-normalize-before: True decoder-normalize-before: False encoder-concat-after: False decoder-concat-after: False diff --git a/egs/arctic/vc1/local/pretrained_model_download.sh b/egs/arctic/vc1/local/pretrained_model_download.sh index b15be3ba196..cdfb8a41d8c 100755 --- a/egs/arctic/vc1/local/pretrained_model_download.sh +++ b/egs/arctic/vc1/local/pretrained_model_download.sh @@ -1,4 +1,5 @@ -#!/usr/bin/env bash -e +#!/usr/bin/env bash +set -e # Copyright 2020 Nagoya University (Wen-Chin Huang) # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) @@ -31,4 +32,4 @@ if [ ! -e ${dir}/.complete ]; then download_from_google_drive.sh ${share_url} ${dir} "tar.gz" touch ${dir}/.complete fi -echo "Successfully finished donwload of pretrained model." +echo "Successfully finished download of pretrained model." diff --git a/egs/aurora4/asr1/cmd.sh b/egs/aurora4/asr1/cmd.sh index 4d70c9c7a79..7b70ef5e06e 100644 --- a/egs/aurora4/asr1/cmd.sh +++ b/egs/aurora4/asr1/cmd.sh @@ -22,7 +22,7 @@ # If jobs failed, your configuration might be wrong for your environment. # # -# The official documentaion for run.pl, queue.pl, slurm.pl, and ssh.pl: +# The official documentation for run.pl, queue.pl, slurm.pl, and ssh.pl: # "Parallelization in Kaldi": http://kaldi-asr.org/doc/queue.html # =========================================================~ @@ -56,7 +56,7 @@ elif [ "${cmd_backend}" = slurm ]; then # The default setting is written in conf/slurm.conf. # You must change "-p cpu" and "-p gpu" for the "partion" for your environment. # To know the "partion" names, type "sinfo". - # You can use "--gpu * " by defualt for slurm and it is interpreted as "--gres gpu:*" + # You can use "--gpu * " by default for slurm and it is interpreted as "--gres gpu:*" # The devices are allocated exclusively using "${CUDA_VISIBLE_DEVICES}". export train_cmd="slurm.pl" diff --git a/egs/babel/asr1/README.md b/egs/babel/asr1/README.md index e8cb946cb64..60d2d98cb5a 100644 --- a/egs/babel/asr1/README.md +++ b/egs/babel/asr1/README.md @@ -62,7 +62,7 @@ To run the experiment do `cd ../expname` -To specify the BABEL langauges in training refer to them by their language id. +To specify the BABEL languages in training refer to them by their language id. See conf/lang.conf for the exhaustive list of languages and corresponding language ids. diff --git a/egs/babel/asr1/cmd.sh b/egs/babel/asr1/cmd.sh index 4d70c9c7a79..7b70ef5e06e 100644 --- a/egs/babel/asr1/cmd.sh +++ b/egs/babel/asr1/cmd.sh @@ -22,7 +22,7 @@ # If jobs failed, your configuration might be wrong for your environment. # # -# The official documentaion for run.pl, queue.pl, slurm.pl, and ssh.pl: +# The official documentation for run.pl, queue.pl, slurm.pl, and ssh.pl: # "Parallelization in Kaldi": http://kaldi-asr.org/doc/queue.html # =========================================================~ @@ -56,7 +56,7 @@ elif [ "${cmd_backend}" = slurm ]; then # The default setting is written in conf/slurm.conf. 
# You must change "-p cpu" and "-p gpu" for the "partion" for your environment. # To know the "partion" names, type "sinfo". - # You can use "--gpu * " by defualt for slurm and it is interpreted as "--gres gpu:*" + # You can use "--gpu * " by default for slurm and it is interpreted as "--gres gpu:*" # The devices are allocated exclusively using "${CUDA_VISIBLE_DEVICES}". export train_cmd="slurm.pl" diff --git a/egs/babel/asr1/conf/lang.conf b/egs/babel/asr1/conf/lang.conf index ae6fa9d4593..7b2960a7231 100644 --- a/egs/babel/asr1/conf/lang.conf +++ b/egs/babel/asr1/conf/lang.conf @@ -1,7 +1,7 @@ # A giant configurations file for all the BABEL languages # as well as some training configurations for training HMM-GMM systems # for obtaining phoneme level alignments if you really want to do that -# All paths starting with /export/* are set for the JHU/CLSP grid and shoudl +# All paths starting with /export/* are set for the JHU/CLSP grid and should # be changed appropriately for other users # Cantonese diff --git a/egs/babel/asr1/local/run_all.sh b/egs/babel/asr1/local/run_all.sh index d04d04c6f2f..3359508b675 100755 --- a/egs/babel/asr1/local/run_all.sh +++ b/egs/babel/asr1/local/run_all.sh @@ -8,7 +8,7 @@ for x in 101-cantonese 102-assamese 103-bengali 104-pashto 105-turkish 106-tagal ./setup_experiment.sh asr1_${lang} pushd ../asr1_${lang} ./run.sh --langs $langid --recog $langid --ngpu 1 & - sleep 20m # to avoid too many disk access happend at the same time + sleep 20m # to avoid too many disk accesses happening at the same time popd done diff --git a/egs/blizzard17/tts1/cmd.sh b/egs/blizzard17/tts1/cmd.sh index 4d70c9c7a79..7b70ef5e06e 100644 --- a/egs/blizzard17/tts1/cmd.sh +++ b/egs/blizzard17/tts1/cmd.sh @@ -22,7 +22,7 @@ # If jobs failed, your configuration might be wrong for your environment. # # -# The official documentaion for run.pl, queue.pl, slurm.pl, and ssh.pl: +# The official documentation for run.pl, queue.pl, slurm.pl, and ssh.pl: # "Parallelization in Kaldi": http://kaldi-asr.org/doc/queue.html # =========================================================~ @@ -56,7 +56,7 @@ elif [ "${cmd_backend}" = slurm ]; then # The default setting is written in conf/slurm.conf. # You must change "-p cpu" and "-p gpu" for the "partion" for your environment. # To know the "partion" names, type "sinfo". - # You can use "--gpu * " by defualt for slurm and it is interpreted as "--gres gpu:*" + # You can use "--gpu * " by default for slurm and it is interpreted as "--gres gpu:*" # The devices are allocated exclusively using "${CUDA_VISIBLE_DEVICES}". export train_cmd="slurm.pl" diff --git a/egs/blizzard17/tts1/conf/tuning/train_pytorch_tacotron2.tuning.lab3-rev3.yaml b/egs/blizzard17/tts1/conf/tuning/train_pytorch_tacotron2.tuning.lab3-rev3.yaml index 504d62846b3..f7c02fc2405 100755 --- a/egs/blizzard17/tts1/conf/tuning/train_pytorch_tacotron2.tuning.lab3-rev3.yaml +++ b/egs/blizzard17/tts1/conf/tuning/train_pytorch_tacotron2.tuning.lab3-rev3.yaml @@ -1,5 +1,5 @@ # To make the attention wight diagonal in decoding, we use forward attention. -# Futhermore, we use reduction-fucter :3 to generate clear speech. +# Furthermore, we use reduction factor 3 to generate clear speech. # encoder related embed-dim: 512 diff --git a/egs/chime4/asr1/cmd.sh b/egs/chime4/asr1/cmd.sh index 4d70c9c7a79..7b70ef5e06e 100644 --- a/egs/chime4/asr1/cmd.sh +++ b/egs/chime4/asr1/cmd.sh @@ -22,7 +22,7 @@ # If jobs failed, your configuration might be wrong for your environment.
 #
 #
-# The official documentaion for run.pl, queue.pl, slurm.pl, and ssh.pl:
+# The official documentation for run.pl, queue.pl, slurm.pl, and ssh.pl:
 #   "Parallelization in Kaldi": http://kaldi-asr.org/doc/queue.html
 # =========================================================~

@@ -56,7 +56,7 @@ elif [ "${cmd_backend}" = slurm ]; then
     # The default setting is written in conf/slurm.conf.
     # You must change "-p cpu" and "-p gpu" for the "partion" for your environment.
     # To know the "partion" names, type "sinfo".
-    # You can use "--gpu * " by defualt for slurm and it is interpreted as "--gres gpu:*"
+    # You can use "--gpu * " by default for slurm and it is interpreted as "--gres gpu:*"
     # The devices are allocated exclusively using "${CUDA_VISIBLE_DEVICES}".
     export train_cmd="slurm.pl"
diff --git a/egs/chime4/asr1/run.sh b/egs/chime4/asr1/run.sh
index 38196c45088..85c4e4a64eb 100755
--- a/egs/chime4/asr1/run.sh
+++ b/egs/chime4/asr1/run.sh
@@ -72,7 +72,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
         ${chime4_data}/data/audio/16kHz/isolated_2ch_track enhan/beamformit_2mics
     local/run_beamform_6ch_track.sh --cmd "${train_cmd}" --nj 20 \
         ${chime4_data}/data/audio/16kHz/isolated_6ch_track enhan/beamformit_5mics
-    echo "prepartion for chime4 data"
+    echo "preparation for chime4 data"
     local/real_noisy_chime4_data_prep.sh ${chime4_data}
     local/simu_noisy_chime4_data_prep.sh ${chime4_data}
     echo "test data for 1ch track"
diff --git a/egs/chime4/asr1_multich/cmd.sh b/egs/chime4/asr1_multich/cmd.sh
index 4d70c9c7a79..7b70ef5e06e 100644
--- a/egs/chime4/asr1_multich/cmd.sh
+++ b/egs/chime4/asr1_multich/cmd.sh
@@ -22,7 +22,7 @@
 #   If jobs failed, your configuration might be wrong for your environment.
 #
 #
-# The official documentaion for run.pl, queue.pl, slurm.pl, and ssh.pl:
+# The official documentation for run.pl, queue.pl, slurm.pl, and ssh.pl:
 #   "Parallelization in Kaldi": http://kaldi-asr.org/doc/queue.html
 # =========================================================~

@@ -56,7 +56,7 @@ elif [ "${cmd_backend}" = slurm ]; then
     # The default setting is written in conf/slurm.conf.
     # You must change "-p cpu" and "-p gpu" for the "partion" for your environment.
     # To know the "partion" names, type "sinfo".
-    # You can use "--gpu * " by defualt for slurm and it is interpreted as "--gres gpu:*"
+    # You can use "--gpu * " by default for slurm and it is interpreted as "--gres gpu:*"
     # The devices are allocated exclusively using "${CUDA_VISIBLE_DEVICES}".
     export train_cmd="slurm.pl"
diff --git a/egs/chime4/asr1_multich/run.sh b/egs/chime4/asr1_multich/run.sh
index 28651c03b59..a0f395e6193 100755
--- a/egs/chime4/asr1_multich/run.sh
+++ b/egs/chime4/asr1_multich/run.sh
@@ -61,7 +61,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
     wsj0_data=${chime4_data}/data/WSJ0
     local/clean_wsj0_data_prep.sh ${wsj0_data}
     local/clean_chime4_format_data.sh
-    echo "prepartion for chime4 data"
+    echo "preparation for chime4 data"
     local/real_noisy_chime4_data_prep.sh ${chime4_data}
     local/simu_noisy_chime4_data_prep.sh ${chime4_data}
     local/bth_chime4_data_prep.sh ${chime4_data}
diff --git a/egs/chime5/asr1/cmd.sh b/egs/chime5/asr1/cmd.sh
index 4d70c9c7a79..7b70ef5e06e 100644
--- a/egs/chime5/asr1/cmd.sh
+++ b/egs/chime5/asr1/cmd.sh
@@ -22,7 +22,7 @@
 #   If jobs failed, your configuration might be wrong for your environment.
 #
 #
-# The official documentaion for run.pl, queue.pl, slurm.pl, and ssh.pl:
+# The official documentation for run.pl, queue.pl, slurm.pl, and ssh.pl:
 #   "Parallelization in Kaldi": http://kaldi-asr.org/doc/queue.html
 # =========================================================~

@@ -56,7 +56,7 @@ elif [ "${cmd_backend}" = slurm ]; then
     # The default setting is written in conf/slurm.conf.
     # You must change "-p cpu" and "-p gpu" for the "partion" for your environment.
     # To know the "partion" names, type "sinfo".
-    # You can use "--gpu * " by defualt for slurm and it is interpreted as "--gres gpu:*"
+    # You can use "--gpu * " by default for slurm and it is interpreted as "--gres gpu:*"
     # The devices are allocated exclusively using "${CUDA_VISIBLE_DEVICES}".
     export train_cmd="slurm.pl"
diff --git a/egs/chime6/asr1/cmd.sh b/egs/chime6/asr1/cmd.sh
index 4d70c9c7a79..7b70ef5e06e 100644
--- a/egs/chime6/asr1/cmd.sh
+++ b/egs/chime6/asr1/cmd.sh
@@ -22,7 +22,7 @@
 #   If jobs failed, your configuration might be wrong for your environment.
 #
 #
-# The official documentaion for run.pl, queue.pl, slurm.pl, and ssh.pl:
+# The official documentation for run.pl, queue.pl, slurm.pl, and ssh.pl:
 #   "Parallelization in Kaldi": http://kaldi-asr.org/doc/queue.html
 # =========================================================~

@@ -56,7 +56,7 @@ elif [ "${cmd_backend}" = slurm ]; then
     # The default setting is written in conf/slurm.conf.
     # You must change "-p cpu" and "-p gpu" for the "partion" for your environment.
     # To know the "partion" names, type "sinfo".
-    # You can use "--gpu * " by defualt for slurm and it is interpreted as "--gres gpu:*"
+    # You can use "--gpu * " by default for slurm and it is interpreted as "--gres gpu:*"
     # The devices are allocated exclusively using "${CUDA_VISIBLE_DEVICES}".
     export train_cmd="slurm.pl"
diff --git a/egs/chime6/asr1/local/install_pb_chime5.sh b/egs/chime6/asr1/local/install_pb_chime5.sh
index 430edb6810d..3a3805daff2 100755
--- a/egs/chime6/asr1/local/install_pb_chime5.sh
+++ b/egs/chime6/asr1/local/install_pb_chime5.sh
@@ -8,7 +8,7 @@ cd pb_chime5
 git submodule init
 git submodule update

-# sudo apt install libopenmpi-dev -- if you have problem with mpi4py instalation
+# sudo apt install libopenmpi-dev -- if you have problems with mpi4py installation
 python -m pip install cython
 python -m pip install pymongo
diff --git a/egs/cmu_indic/tts1/cmd.sh b/egs/cmu_indic/tts1/cmd.sh
new file mode 100644
index 00000000000..7b70ef5e06e
--- /dev/null
+++ b/egs/cmu_indic/tts1/cmd.sh
@@ -0,0 +1,89 @@
+# ====== About run.pl, queue.pl, slurm.pl, and ssh.pl ======
+# Usage: <cmd>.pl [options] JOB=1:<nj> <log> <command...>
+# e.g.
+#     run.pl --mem 4G JOB=1:10 echo.JOB.log echo JOB
+#
+# Options:
+#     --time
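
All of the cmd.sh files touched above document the same <cmd>.pl interface, so a minimal usage sketch may help readers unfamiliar with it. This is illustrative only, not part of the patch; the exp/demo log paths and echoed strings are placeholders invented for the example:

    # Local backend: run a 10-way array job. JOB is substituted into both the
    # log name and the command, so task 3 writes "running task 3" to
    # exp/demo/echo.3.log.
    run.pl JOB=1:10 exp/demo/echo.JOB.log echo "running task JOB"

    # The same call shape works unchanged once cmd.sh has selected a cluster
    # backend; under slurm.pl, "--gpu 1" is interpreted as "--gres gpu:1",
    # as the comments in the cmd.sh hunks above note.
    ${train_cmd} --gpu 1 exp/demo/train.log echo "placeholder training command"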