diff --git a/.circleci/config.yml b/.circleci/config.yml index 8aba5c755f..96fc11b44e 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -25,17 +25,15 @@ jobs: - checkout - run: | sudo apt update - sudo apt install espeak git + sudo apt install espeak-ng git - run: sudo pip install --upgrade pip - run: sudo pip install -e . - run: | sudo pip install --quiet --upgrade cardboardlint pylint cardboardlinter --refspec ${CIRCLE_BRANCH} -n auto - - run: nosetests tests --nocapture + - run: nosetests tests --nocapture --processes=0 --process-timeout=20 --process-restartworker - run: | - sudo ./tests/test_server_package.sh sudo ./tests/test_glow-tts_train.sh - sudo ./tests/test_server_package.sh sudo ./tests/test_tacotron_train.sh sudo ./tests/test_vocoder_gan_train.sh sudo ./tests/test_vocoder_wavegrad_train.sh diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml new file mode 100644 index 0000000000..a5b22f5b00 --- /dev/null +++ b/.github/workflows/main.yml @@ -0,0 +1,60 @@ +name: Test + +on: + push: + branches: + - master + - dev + pull_request: + types: [opened, synchronize, reopened] +jobs: + check_skip: + runs-on: ubuntu-latest + if: "! contains(github.event.head_commit.message, '[ci skip]')" + steps: + - run: echo "${{ github.event.head_commit.message }}" + + test: + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + python-version: [3.6, 3.7, 3.8] + + steps: + - uses: actions/checkout@v2 + - uses: actions/cache@v1 + with: + path: ~/.cache/pip + key: ${{ runner.os }}-pip-${{ matrix.python-version }}-${{ hashFiles('**/setup.py') }} + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} + - name: check OS + run: cat /etc/os-release + - name: Install dependencies + run: | + sudo apt update + sudo apt install espeak-ng git + - name: Upgrade pip + # so we can take advantage of pyproject.toml build-dependency support + run: python3 -m pip install --upgrade pip + - name: Install TTS + run: | + python3 -m pip install . + python3 setup.py egg_info + - name: Lint check + run: | + cardboardlinter -n auto + - name: Unit tests + run: nosetests tests --nocapture --processes=0 --process-timeout=20 --process-restartworker + - name: Test scripts + run: | + ./tests/test_demo_server.sh + ./tests/test_glow-tts_train.sh + ./tests/test_tacotron_train.sh + ./tests/test_vocoder_gan_train.sh + ./tests/test_vocoder_wavegrad_train.sh + ./tests/test_vocoder_wavernn_train.sh + ./tests/test_speedy_speech_train.sh diff --git a/.gitignore b/.gitignore index 579bfbea10..f797d80d76 100644 --- a/.gitignore +++ b/.gitignore @@ -130,3 +130,4 @@ TODO.txt data/* notebooks/data/* TTS/tts/layers/glow_tts/monotonic_align/core.c +temp_build/* \ No newline at end of file diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md index 3b6d813c20..b80639d63c 100644 --- a/CODE_OF_CONDUCT.md +++ b/CODE_OF_CONDUCT.md @@ -1,19 +1,133 @@ -# Ethical Notice -Please consider possible consequences and be mindful of any adversarial use cases of this project. In this regard, please contact us if you have any concerns. +# Contributor Covenant Code of Conduct -# Community Participation Guidelines +## Our Pledge -This repository is governed by Mozilla's code of conduct and etiquette guidelines. -For more details, please read the -[Mozilla Community Participation Guidelines](https://www.mozilla.org/about/governance/policies/participation/). 
+We as members, contributors, and leaders pledge to make participation in our +community a harassment-free experience for everyone, regardless of age, body +size, visible or invisible disability, ethnicity, sex characteristics, gender +identity and expression, level of experience, education, socio-economic status, +nationality, personal appearance, race, caste, color, religion, or sexual identity +and orientation. -## How to Report -For more information on how to report violations of the Community Participation Guidelines, please read our '[How to Report](https://www.mozilla.org/about/governance/policies/participation/reporting/)' page. +We pledge to act and interact in ways that contribute to an open, welcoming, +diverse, inclusive, and healthy community. - +Examples of behavior that contributes to a positive environment for our +community include: + +* Demonstrating empathy and kindness toward other people +* Being respectful of differing opinions, viewpoints, and experiences +* Giving and gracefully accepting constructive feedback +* Accepting responsibility and apologizing to those affected by our mistakes, + and learning from the experience +* Focusing on what is best not just for us as individuals, but for the + overall community + +Examples of unacceptable behavior include: + +* The use of sexualized language or imagery, and sexual attention or + advances of any kind +* Trolling, insulting or derogatory comments, and personal or political attacks +* Public or private harassment +* Publishing others' private information, such as a physical or email + address, without their explicit permission +* Other conduct which could reasonably be considered inappropriate in a + professional setting + +## Enforcement Responsibilities + +Community leaders are responsible for clarifying and enforcing our standards of +acceptable behavior and will take appropriate and fair corrective action in +response to any behavior that they deem inappropriate, threatening, offensive, +or harmful. + +Community leaders have the right and responsibility to remove, edit, or reject +comments, commits, code, wiki edits, issues, and other contributions that are +not aligned to this Code of Conduct, and will communicate reasons for moderation +decisions when appropriate. + +## Scope + +This Code of Conduct applies within all community spaces, and also applies when +an individual is officially representing the community in public spaces. +Examples of representing our community include using an official e-mail address, +posting via an official social media account, or acting as an appointed +representative at an online or offline event. + +## Enforcement + +Instances of abusive, harassing, or otherwise unacceptable behavior may be +reported to the community leaders responsible for enforcement at +coc-report@coqui.ai. +All complaints will be reviewed and investigated promptly and fairly. + +All community leaders are obligated to respect the privacy and security of the +reporter of any incident. + +## Enforcement Guidelines + +Community leaders will follow these Community Impact Guidelines in determining +the consequences for any action they deem in violation of this Code of Conduct: + +### 1. Correction + +**Community Impact**: Use of inappropriate language or other behavior deemed +unprofessional or unwelcome in the community. + +**Consequence**: A private, written warning from community leaders, providing +clarity around the nature of the violation and an explanation of why the +behavior was inappropriate. 
A public apology may be requested. + +### 2. Warning + +**Community Impact**: A violation through a single incident or series +of actions. + +**Consequence**: A warning with consequences for continued behavior. No +interaction with the people involved, including unsolicited interaction with +those enforcing the Code of Conduct, for a specified period of time. This +includes avoiding interactions in community spaces as well as external channels +like social media. Violating these terms may lead to a temporary or +permanent ban. + +### 3. Temporary Ban + +**Community Impact**: A serious violation of community standards, including +sustained inappropriate behavior. + +**Consequence**: A temporary ban from any sort of interaction or public +communication with the community for a specified period of time. No public or +private interaction with the people involved, including unsolicited interaction +with those enforcing the Code of Conduct, is allowed during this period. +Violating these terms may lead to a permanent ban. + +### 4. Permanent Ban + +**Community Impact**: Demonstrating a pattern of violation of community +standards, including sustained inappropriate behavior, harassment of an +individual, or aggression toward or disparagement of classes of individuals. + +**Consequence**: A permanent ban from any sort of public interaction within +the community. + +## Attribution + +This Code of Conduct is adapted from the [Contributor Covenant][homepage], +version 2.0, available at +[https://www.contributor-covenant.org/version/2/0/code_of_conduct.html][v2.0]. + +Community Impact Guidelines were inspired by +[Mozilla's code of conduct enforcement ladder][Mozilla CoC]. + +For answers to common questions about this code of conduct, see the FAQ at +[https://www.contributor-covenant.org/faq][FAQ]. Translations are available +at [https://www.contributor-covenant.org/translations][translations]. + +[homepage]: https://www.contributor-covenant.org +[v2.0]: https://www.contributor-covenant.org/version/2/0/code_of_conduct.html +[Mozilla CoC]: https://github.com/mozilla/diversity +[FAQ]: https://www.contributor-covenant.org/faq +[translations]: https://www.contributor-covenant.org/translations diff --git a/README.md b/README.md index f8e8ef30b0..3232edbdda 100644 --- a/README.md +++ b/README.md @@ -1,17 +1,18 @@ # -TTS is a library for advanced Text-to-Speech generation. It's built on the latest research, was designed to achieve the best trade-off among ease-of-training, speed and quality. -TTS comes with [pretrained models](https://github.com/coqui-ai/TTS/wiki/Released-Models), tools for measuring dataset quality and already used in **20+ languages** for products and research projects. +:frog: TTS is a library for advanced Text-to-Speech generation. It's built on the latest research, was designed to achieve the best trade-off among ease-of-training, speed and quality. +:frog: TTS comes with [pretrained models](https://github.com/coqui-ai/TTS/wiki/Released-Models), tools for measuring dataset quality and already used in **20+ languages** for products and research projects. 
[![License]()](https://opensource.org/licenses/MPL-2.0) [![PyPI version](https://badge.fury.io/py/TTS.svg)](https://badge.fury.io/py/TTS) +[![Covenant](https://camo.githubusercontent.com/7d620efaa3eac1c5b060ece5d6aacfcc8b81a74a04d05cd0398689c01c4463bb/68747470733a2f2f696d672e736869656c64732e696f2f62616467652f436f6e7472696275746f72253230436f76656e616e742d76322e3025323061646f707465642d6666363962342e737667)](https://github.com/coqui-ai/TTS/blob/master/CODE_OF_CONDUCT.md) -:loudspeaker: [English Voice Samples](https://erogol.github.io/ddc-samples/) and [SoundCloud playlist](https://soundcloud.com/user-565970875/pocket-article-wavernn-and-tacotron2) +📢 [English Voice Samples](https://erogol.github.io/ddc-samples/) and [SoundCloud playlist](https://soundcloud.com/user-565970875/pocket-article-wavernn-and-tacotron2) -:man_cook: [TTS training recipes](https://github.com/erogol/TTS_recipes) +👩🏽‍🍳 [TTS training recipes](https://github.com/erogol/TTS_recipes) -:page_facing_up: [Text-to-Speech paper collection](https://github.com/erogol/TTS-papers) +📄 [Text-to-Speech paper collection](https://github.com/erogol/TTS-papers) ## 💬 Where to ask questions Please use our dedicated channels for questions and discussion. Help is much more valuable if it's shared publicly, so that more people can benefit from it. @@ -87,24 +88,29 @@ Underlined "TTS*" and "Judy*" are :frog:TTS models - WaveRNN: [origin](https://github.com/fatchord/WaveRNN/) - WaveGrad: [paper](https://arxiv.org/abs/2009.00713) -You can also help us implement more models. Some TTS related work can be found [here](https://github.com/erogol/TTS-papers). +You can also help us implement more models. Some :frog: TTS related work can be found [here](https://github.com/erogol/TTS-papers). ## Install TTS -TTS is tested on Ubuntu 18.04 with **python >= 3.6, < 3.9**. +:frog: TTS is tested on Ubuntu 18.04 with **python >= 3.6, < 3.9**. -If you are only interested in [synthesizing speech](https://github.com/coqui-ai/TTS/tree/dev#example-synthesizing-speech-on-terminal-using-the-released-models) with the released TTS models, installing from PyPI is the easiest option. +If you are only interested in [synthesizing speech](https://github.com/coqui-ai/TTS/tree/dev#example-synthesizing-speech-on-terminal-using-the-released-models) with the released :frog: TTS models, installing from PyPI is the easiest option. ```bash pip install TTS ``` -If you plan to code or train models, clone TTS and install it locally. +If you plan to code or train models, clone :frog: TTS and install it locally. ```bash git clone https://github.com/coqui-ai/TTS pip install -e . ``` +We use ```espeak-ng``` to convert graphemes to phonemes. You might need to install it separately. +```bash +sudo apt-get install espeak-ng +``` + ## Directory Structure ``` |- notebooks/ (Jupyter Notebooks for model evaluation, parameter selection and data analysis.) @@ -136,11 +142,11 @@ Audio examples: [soundcloud](https://soundcloud.com/user-565970875/pocket-articl example_output ## Datasets and Data-Loading -TTS provides a generic dataloader easy to use for your custom dataset. +:frog: TTS provides a generic dataloader that is easy to use with your custom dataset. You just need to write a simple function to format the dataset. Check ```datasets/preprocess.py``` to see some examples. After that, you need to set ```dataset``` fields in ```config.json```.
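For reference, here is a minimal sketch of such a formatter function, assuming the convention used by the existing preprocessors in `TTS/tts/datasets/preprocess.py`: take `(root_path, meta_file)` and return a list of `[text, wav_path, speaker_name]` items. The metadata filename, delimiter, and folder layout below are hypothetical:

```python
import os


def my_dataset(root_path, meta_file):
    """Hypothetical formatter for a metadata file with `wav_id|transcript` lines."""
    speaker_name = "my_speaker"  # assume a single-speaker dataset
    items = []
    with open(os.path.join(root_path, meta_file), "r", encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                continue
            wav_id, text = line.strip().split("|", 1)
            wav_file = os.path.join(root_path, "wavs", wav_id + ".wav")
            items.append([text, wav_file, speaker_name])
    return items
```

Each returned item pairs a transcript with the path of its audio clip, which is the shape the generic dataloader expects.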
-Some of the public datasets that we successfully applied TTS: +Some of the public datasets to which we have successfully applied :frog: TTS: - [LJ Speech](https://keithito.com/LJ-Speech-Dataset/) - [Nancy](http://www.cstr.ed.ac.uk/projects/blizzard/2011/lessac_blizzard2011/) @@ -151,9 +157,9 @@ Some of the public datasets that we successfully applied TTS: ## Example: Synthesizing Speech on Terminal Using the Released Models. -After the installation, TTS provides a CLI interface for synthesizing speech using pre-trained models. You can either use your own model or the release models under the TTS project. +After the installation, :frog: TTS provides a CLI interface for synthesizing speech using pre-trained models. You can either use your own model or the released models under :frog: TTS. -Listing released TTS models. +Listing released :frog: TTS models. ```bash tts --list_models ``` @@ -230,25 +236,28 @@ In case of any error or intercepted execution, if there is no checkpoint yet und You can also enjoy Tensorboard, if you point Tensorboard argument```--logdir``` to the experiment folder. ## Contribution guidelines -Please send your Pull Request to ```dev``` branch. Before making a Pull Request, check your changes for basic mistakes and style problems by using a linter. We have cardboardlinter setup in this repository, so for example, if you've made some changes and would like to run the linter on just the changed code, you can use the follow command: +Please follow the steps below as you send a PR for :frog:. It helps us to keep things organized. +1. Create a new branch. +2. Implement your changes. +3. (if applicable) Add [Google Style](https://google.github.io/styleguide/pyguide.html#381-docstrings) docstrings. +4. (if applicable) Implement a test case under ```tests``` folder. +5. (Optional but Preferred) Run tests. +```bash +./run_tests.sh +``` +6. Run the linter. ```bash pip install pylint cardboardlint cardboardlinter --refspec master ``` 7. Send a PR to ```dev``` branch, explain what the change is about. -8. Let us discuss until we make it perfect :). +8. Let us discuss until we make it perfect :) 💪. 9. We merge it to the ```dev``` branch once things look good. Feel free to ping us at any step you need help using our communication channels. +[Here](https://github.com/firstcontributions/first-contributions) is a good resource for complete beginners. -## Collaborative Experimentation Guide -If you like to use TTS to try a new idea and like to share your experiments with the community, we urge you to use the following guideline for a better collaboration. -(If you have an idea for better collaboration, let us know) -- Create a new branch. -- Open an issue pointing your branch. -- Explain your idea and experiment. -- Share your results regularly. (Tensorboard log files, audio results, visuals etc.)
### Acknowledgement - https://github.com/keithito/tacotron (Dataset pre-processing) diff --git a/TTS/.models.json b/TTS/.models.json index 075861dbe7..6c7dfbc514 100644 --- a/TTS/.models.json +++ b/TTS/.models.json @@ -1,77 +1,146 @@ { "tts_models":{ "en":{ + "ek1":{ + "tacotron2": { + "description": "EK1 en-rp tacotron2 by NMStoker", + "model_file": "1OJ5sLYmB03dQAf1FcY06b5X-0hiR0SNZ", + "config_file": "1hSnodL--5AFJTWvlU96e0pCnCfNU3yM_", + "stats_file": null, + "default_vocoder": "vocoder_models/en/ek1/wavegrad", + "commit": "c802255" + } + }, "ljspeech":{ "glow-tts":{ "description": "", - "model_file": "1NFsfhH8W8AgcfJ-BsL8CYAwQfZ5k4T-n", - "config_file": "1IAROF3yy9qTK43vG_-R67y3Py9yYbD6t", + "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.0.9/tts_models--en--ljspeech--glow-tts.zip", "stats_file": null, - "commit": "" + "default_vocoder": "vocoder_models/en/ljspeech/multiband-melgan", + "commit": "", + "author": "Eren Gölge @erogol", + "license": "MPL", + "contact":"egolge@coqui.com" }, "tacotron2-DCA": { "description": "", - "model_file": "1CFoPDQBnhfBFu2Gc0TBSJn8o-TuNKQn7", - "config_file": "1lWSscNfKet1zZSJCNirOn7v9bigUZ8C1", - "stats_file": "1qevpGRVHPmzfiRBNuugLMX62x1k7B5vK", - "commit": "" + "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.0.9/tts_models--en--ljspeech--tacotron2-DCA.zip", + "default_vocoder": "vocoder_models/en/ljspeech/multiband-melgan", + "commit": "", + "author": "Eren Gölge @erogol", + "license": "MPL", + "contact":"egolge@coqui.com" }, "speedy-speech-wn":{ "description": "Speedy Speech model with wavenet decoder.", "model_file": "1VXAwiq6N-Viq3rsSXlf43bdoi0jSvMAJ", "config_file": "1KvZilhsNP3EumVggDcD46yd834eO5hR3", "stats_file": "1Ju7apZ5JlgsVECcETL-GEx3DRoNzWfkR", - "commit": "77b6145" + "default_vocoder": "vocoder_models/en/ljspeech/multiband-melgan", + "commit": "77b6145", + "author": "Eren Gölge @erogol", + "license": "MPL", + "contact":"egolge@coqui.com" } } }, "es":{ "mai":{ "tacotron2-DDC":{ - "model_file": "1jZ4HvYcAXI5ZClke2iGA7qFQQJBXIovw", - "config_file": "1s7g4n-B73ChCB48AQ88_DV_8oyLth8r0", - "stats_file": "13st0CZ743v6Br5R5Qw_lH1OPQOr3M-Jv", - "commit": "" + "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.0.9/tts_models--es--mai--tacotron2-DDC.zip", + "default_vocoder": "vocoder_models/universal/libri-tts/fullband-melgan", + "commit": "", + "author": "Eren Gölge @erogol", + "license": "MPL", + "contact":"egolge@coqui.com" } } }, "fr":{ "mai":{ "tacotron2-DDC":{ - "model_file": "1qyxrrCyoXUvBG2lqVd0KqAlHj-2nZCgS", - "config_file": "1yECKeP2LI7tNv4E8yVNx1yLmCfTCpkqG", - "stats_file": "13st0CZ743v6Br5R5Qw_lH1OPQOr3M-Jv", + "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.0.9/tts_models--fr--mai--tacotron2-DDC.zip", + "default_vocoder": "vocoder_models/universal/libri-tts/fullband-melgan", + "commit": "", + "author": "Eren Gölge @erogol", + "license": "MPL", + "contact":"egolge@coqui.com" + } + } + }, + "zh-CN":{ + "baker":{ + "tacotron2-DDC-GST":{ + "model_file": "1SYpv7V__QYDjKXa_vJmNXo1CSkcoZovy", + "config_file": "14BIvfJXnFHi3jcxYNX40__TR6RwJOZqi", + "stats_file": "1ECRlXybT6rAWp269CkhjUPwcZ10CkcqD", "commit": "" } } + }, + "nl":{ + "mai":{ + "tacotron2-DDC":{ + "model_file": "1cJWpKflkvchVidn0oyUZdv9CPDOUMi_Q", + "config_file": "1SY61VgvMNafrb76BgxP-ulsG8rhTE1we", + "author": "@r-dh", + "default_vocoder": "vocoder_models/nl/mai/parallel-wavegan", + "stats_file": null, + "commit": "540d811" + } + } } }, "vocoder_models":{ "universal":{ "libri-tts":{ 
"wavegrad":{ - "model_file": "1r2g90JaZsfCj9dJkI9ioIU6JCFMPRqi6", - "config_file": "1POrrLf5YEpZyjvWyMccj1nGCVc94mR6s", - "stats_file": "1Vwbv4t-N1i3jXqI0bgKAhShAEO097sK0", - "commit": "ea976b0" + "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.0.9/vocoder_models--universal--libri-tts--wavegrad.zip", + "commit": "ea976b0", + "author": "Eren Gölge @erogol", + "license": "MPL", + "contact":"egolge@coqui.com" }, "fullband-melgan":{ - "model_file": "1Ty5DZdOc0F7OTGj9oJThYbL5iVu_2G0K", - "config_file": "1Rd0R_nRCrbjEdpOwq6XwZAktvugiBvmu", - "stats_file": "11oY3Tv0kQtxK_JPgxrfesa99maVXHNxU", - "commit": "4132240" + "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.0.9/vocoder_models--universal--libri-tts--fullband-melgan.zip", + "commit": "4132240", + "author": "Eren Gölge @erogol", + "license": "MPL", + "contact":"egolge@coqui.com" } } }, "en": { + "ek1":{ + "wavegrad": { + "description": "EK1 en-rp wavegrad by NMStoker", + "model_file": "1ShaCSrQfSRjM66vo45Bgo019uJDDloLS", + "config_file": "1otnQR5yTfN5A77yMKmUSzwh_VNvYwKai", + "stats_file": null, + "commit": "c802255" + } + }, "ljspeech":{ - "mulitband-melgan":{ - "model_file": "1Ty5DZdOc0F7OTGj9oJThYbL5iVu_2G0K", - "config_file": "1Rd0R_nRCrbjEdpOwq6XwZAktvugiBvmu", - "stats_file": "11oY3Tv0kQtxK_JPgxrfesa99maVXHNxU", - "commit": "ea976b0" + "multiband-melgan":{ + "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.0.9/vocoder_models--en--ljspeech--mulitband-melgan.zip", + "commit": "ea976b0", + "author": "Eren Gölge @erogol", + "license": "MPL", + "contact":"egolge@coqui.com" + } + } + }, + "nl":{ + "mai":{ + "parallel-wavegan":{ + "model_file": "1zYFHElvYW_oTeilvbZVLMLscColWRbck", + "config_file": "1ye9kVDbatAKMncRMui7watrLQ_5DaJ3e", + "author": "@r-dh", + "default_vocoder": "vocoder_models/universal/libri-tts/fullband-melgan", + "stats_file": "1QD40bU_M7CWrj9k0MEACNBRqwqVTSLDc", + "commit": "unknown" } } } } -} \ No newline at end of file +} diff --git a/TTS/bin/compute_attention_masks.py b/TTS/bin/compute_attention_masks.py index fc02144aa7..53246e07fe 100644 --- a/TTS/bin/compute_attention_masks.py +++ b/TTS/bin/compute_attention_masks.py @@ -16,6 +16,7 @@ if __name__ == '__main__': + # pylint: disable=bad-continuation parser = argparse.ArgumentParser( description='''Extract attention masks from trained Tacotron/Tacotron2 models. 
These masks can be used for different purposes including training a TTS model with a Duration Predictor.\n\n''' diff --git a/TTS/bin/compute_statistics.py b/TTS/bin/compute_statistics.py index 7642f86bf0..a74fe90aef 100755 --- a/TTS/bin/compute_statistics.py +++ b/TTS/bin/compute_statistics.py @@ -19,8 +19,8 @@ def main(): description="Compute mean and variance of spectrogtram features.") parser.add_argument("--config_path", type=str, required=True, help="TTS config file path to define audio processin parameters.") - parser.add_argument("--out_path", default=None, type=str, - help="directory to save the output file.") + parser.add_argument("--out_path", type=str, required=True, + help="save path (directory and filename).") args = parser.parse_args() # load config diff --git a/TTS/bin/find_unique_chars.py b/TTS/bin/find_unique_chars.py new file mode 100644 index 0000000000..f9b6827b31 --- /dev/null +++ b/TTS/bin/find_unique_chars.py @@ -0,0 +1,47 @@ +"""Find all the unique characters in a dataset""" +import os +import argparse +from argparse import RawTextHelpFormatter + +from TTS.tts.datasets.preprocess import get_preprocessor_by_name + + +def main(): + # pylint: disable=bad-continuation + parser = argparse.ArgumentParser(description='''Find all the unique characters or phonemes in a dataset.\n\n''' + + '''Target dataset must be defined in TTS.tts.datasets.preprocess\n\n'''\ + ''' + Example runs: + + python TTS/bin/find_unique_chars.py --dataset ljspeech --meta_file /path/to/LJSpeech/metadata.csv + ''', formatter_class=RawTextHelpFormatter) + + parser.add_argument( + '--dataset', + type=str, + default='', + help='One of the target dataset names in TTS.tts.datasets.preprocess.' + ) + + parser.add_argument( + '--meta_file', + type=str, + default=None, + help='Path to the transcriptions file of the dataset.' + ) + + args = parser.parse_args() + + preprocessor = get_preprocessor_by_name(args.dataset) + items = preprocessor(os.path.dirname(args.meta_file), os.path.basename(args.meta_file)) + texts = "".join(item[0] for item in items) + chars = set(texts) + lower_chars = filter(lambda c: c.islower(), chars) + print(f" > Number of unique characters: {len(chars)}") + print(f" > Unique characters: {''.join(sorted(chars))}") + print(f" > Unique lower characters: {''.join(sorted(lower_chars))}") + + +if __name__ == "__main__": + main() diff --git a/TTS/bin/synthesize.py b/TTS/bin/synthesize.py index b7ccf850e7..5945b2ada1 100755 --- a/TTS/bin/synthesize.py +++ b/TTS/bin/synthesize.py @@ -27,7 +27,9 @@ def main(): # pylint: disable=bad-continuation parser = argparse.ArgumentParser(description='''Synthesize speech on command line.\n\n''' - '''You can either use your trained model or choose a model from the provided list.\n'''\ + '''You can either use your trained model or choose a model from the provided list.\n\n'''\ + + '''If you don't specify any models, then it uses LJSpeech based English models\n\n'''\ ''' Example runs: @@ -35,7 +37,13 @@ def main(): # list provided models ./TTS/bin/synthesize.py --list_models - # run a model from the list + # run tts with default models. + ./TTS/bin synthesize.py --text "Text for TTS" + + # run a tts model with its default vocoder model. 
+ ./TTS/bin synthesize.py --text "Text for TTS" --model_name "//" + + # run with specific tts and vocoder models from the list ./TTS/bin/synthesize.py --text "Text for TTS" --model_name "//" --vocoder_name "//" --output_path # run your own TTS model (Using Griffin-Lim Vocoder) @@ -67,7 +75,7 @@ def main(): parser.add_argument( '--model_name', type=str, - default=None, + default="tts_models/en/ljspeech/speedy-speech-wn", help= 'Name of one of the pre-trained tts models in format //' ) @@ -142,6 +150,10 @@ def main(): args = parser.parse_args() + # print the description if either text or list_models is not set + if args.text is None and not args.list_models: + parser.parse_args(['-h']) + # load model manager path = Path(__file__).parent / "../.models.json" manager = ModelManager(path) @@ -158,10 +170,11 @@ def main(): # CASE2: load pre-trained models if args.model_name is not None: - model_path, config_path = manager.download_model(args.model_name) + model_path, config_path, model_item = manager.download_model(args.model_name) + args.vocoder_name = model_item['default_vocoder'] if args.vocoder_name is None else args.vocoder_name if args.vocoder_name is not None: - vocoder_path, vocoder_config_path = manager.download_model(args.vocoder_name) + vocoder_path, vocoder_config_path, _ = manager.download_model(args.vocoder_name) # CASE3: load custome models if args.model_path is not None: @@ -176,7 +189,6 @@ def main(): # load models synthesizer = Synthesizer(model_path, config_path, vocoder_path, vocoder_config_path, args.use_cuda) - use_griffin_lim = vocoder_path is None print(" > Text: {}".format(args.text)) # # handle multi-speaker setting @@ -211,8 +223,8 @@ def main(): str.maketrans('', '', string.punctuation.replace('_', ''))) + '.wav' out_path = os.path.join(args.out_path, file_name) print(" > Saving output to {}".format(out_path)) - synthesizer.save_wav(wav, out_path) + synthesizer.save_wav(wav, out_path,) if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/TTS/bin/train_encoder.py b/TTS/bin/train_encoder.py index 5201f548b0..12fba6e11c 100644 --- a/TTS/bin/train_encoder.py +++ b/TTS/bin/train_encoder.py @@ -34,7 +34,9 @@ print(" > Number of GPUs: ", num_gpus) -def setup_loader(ap: AudioProcessor, is_val: bool=False, verbose: bool=False): +def setup_loader(ap: AudioProcessor, + is_val: bool = False, + verbose: bool = False): if is_val: loader = None else: @@ -254,8 +256,7 @@ def main(args): # pylint: disable=redefined-outer-name if args.restore_path: new_fields["restore_path"] = args.restore_path new_fields["github_branch"] = get_git_branch() - copy_model_files(c, args.config_path, OUT_PATH, - new_fields) + copy_model_files(c, args.config_path, OUT_PATH, new_fields) LOG_DIR = OUT_PATH tb_logger = TensorboardLogger(LOG_DIR, model_name='Speaker_Encoder') diff --git a/TTS/bin/train_glow_tts.py b/TTS/bin/train_glow_tts.py index d03ab1eec4..23695f706f 100644 --- a/TTS/bin/train_glow_tts.py +++ b/TTS/bin/train_glow_tts.py @@ -1,8 +1,6 @@ #!/usr/bin/env python3 -# -*- coding: utf-8 -*- +"""Train Glow TTS model.""" -import argparse -import glob import os import sys import time @@ -14,10 +12,12 @@ from torch.nn.parallel import DistributedDataParallel as DDP_th from torch.utils.data import DataLoader from torch.utils.data.distributed import DistributedSampler + +from TTS.utils.arguments import parse_arguments, process_args from TTS.tts.datasets.preprocess import load_meta_data from TTS.tts.datasets.TTSDataset import MyDataset from TTS.tts.layers.losses import 
GlowTTSLoss -from TTS.tts.utils.generic_utils import check_config_tts, setup_model +from TTS.tts.utils.generic_utils import setup_model from TTS.tts.utils.io import save_best_model, save_checkpoint from TTS.tts.utils.measures import alignment_diagonal_score from TTS.tts.utils.speakers import parse_speakers @@ -25,18 +25,15 @@ from TTS.tts.utils.text.symbols import make_symbols, phonemes, symbols from TTS.tts.utils.visual import plot_alignment, plot_spectrogram from TTS.utils.audio import AudioProcessor -from TTS.utils.console_logger import ConsoleLogger from TTS.utils.distribute import init_distributed, reduce_tensor from TTS.utils.generic_utils import (KeepAverage, count_parameters, - create_experiment_folder, get_git_branch, remove_experiment_folder, set_init_dict) -from TTS.utils.io import copy_model_files, load_config from TTS.utils.radam import RAdam -from TTS.utils.tensorboard_logger import TensorboardLogger from TTS.utils.training import NoamLR, setup_torch_training_env use_cuda, num_gpus = setup_torch_training_env(True, False) + def setup_loader(ap, r, is_val=False, verbose=False): if is_val and not c.run_eval: loader = None @@ -119,7 +116,7 @@ def format_data(data): avg_text_length, avg_spec_length, attn_mask, item_idx -def data_depended_init(data_loader, model, ap): +def data_depended_init(data_loader, model): """Data depended initialization for activation normalization.""" if hasattr(model, 'module'): for f in model.module.decoder.flows: @@ -138,7 +135,7 @@ def data_depended_init(data_loader, model, ap): # format data text_input, text_lengths, mel_input, mel_lengths, spekaer_embed,\ - _, _, attn_mask, item_idx = format_data(data) + _, _, attn_mask, _ = format_data(data) # forward pass model _ = model.forward( @@ -177,7 +174,7 @@ def train(data_loader, model, criterion, optimizer, scheduler, # format data text_input, text_lengths, mel_input, mel_lengths, speaker_c,\ - avg_text_length, avg_spec_length, attn_mask, item_idx = format_data(data) + avg_text_length, avg_spec_length, attn_mask, _ = format_data(data) loader_time = time.time() - end_time @@ -191,20 +188,20 @@ def train(data_loader, model, criterion, optimizer, scheduler, # compute loss loss_dict = criterion(z, y_mean, y_log_scale, logdet, mel_lengths, - o_dur_log, o_total_dur, text_lengths) + o_dur_log, o_total_dur, text_lengths) # backward pass with loss scaling if c.mixed_precision: scaler.scale(loss_dict['loss']).backward() scaler.unscale_(optimizer) grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), - c.grad_clip) + c.grad_clip) scaler.step(optimizer) scaler.update() else: loss_dict['loss'].backward() grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), - c.grad_clip) + c.grad_clip) optimizer.step() # setup lr @@ -271,7 +268,7 @@ def train(data_loader, model, criterion, optimizer, scheduler, if global_step % c.save_step == 0: if c.checkpoint: # save model - save_checkpoint(model, optimizer, global_step, epoch, 1, OUT_PATH, + save_checkpoint(model, optimizer, global_step, epoch, 1, OUT_PATH, model_characters, model_loss=loss_dict['loss']) # wait all kernels to be completed @@ -332,7 +329,7 @@ def evaluate(data_loader, model, criterion, ap, global_step, epoch): # format data text_input, text_lengths, mel_input, mel_lengths, speaker_c,\ - _, _, attn_mask, item_idx = format_data(data) + _, _, attn_mask, _ = format_data(data) # forward pass model z, logdet, y_mean, y_log_scale, alignments, o_dur_log, o_total_dur = model.forward( @@ -468,10 +465,9 @@ def evaluate(data_loader, model, criterion, ap, 
global_step, epoch): return keep_avg.avg_values -# FIXME: move args definition/parsing inside of main? def main(args): # pylint: disable=redefined-outer-name # pylint: disable=global-variable-undefined - global meta_data_train, meta_data_eval, symbols, phonemes, speaker_mapping + global meta_data_train, meta_data_eval, symbols, phonemes, model_characters, speaker_mapping # Audio processor ap = AudioProcessor(**c.audio) if 'characters' in c.keys(): @@ -481,7 +477,10 @@ def main(args): # pylint: disable=redefined-outer-name if num_gpus > 1: init_distributed(args.rank, num_gpus, args.group_id, c.distributed["backend"], c.distributed["url"]) - num_chars = len(phonemes) if c.use_phonemes else len(symbols) + + # set model characters + model_characters = phonemes if c.use_phonemes else symbols + num_chars = len(model_characters) # load data instances meta_data_train, meta_data_eval = load_meta_data(c.datasets) @@ -501,6 +500,7 @@ def main(args): # pylint: disable=redefined-outer-name criterion = GlowTTSLoss() if args.restore_path: + print(f" > Restoring from {os.path.basename(args.restore_path)} ...") checkpoint = torch.load(args.restore_path, map_location='cpu') try: # TODO: fix optimizer init, model.cuda() needs to be called before @@ -518,7 +518,7 @@ def main(args): # pylint: disable=redefined-outer-name for group in optimizer.param_groups: group['initial_lr'] = c.lr - print(" > Model restored from step %d" % checkpoint['step'], + print(f" > Model restored from step {checkpoint['step']:d}", flush=True) args.restore_step = checkpoint['step'] else: @@ -542,106 +542,45 @@ def main(args): # pylint: disable=redefined-outer-name num_params = count_parameters(model) print("\n > Model has {} parameters".format(num_params), flush=True) - if 'best_loss' not in locals(): + if args.restore_step == 0 or not args.best_path: best_loss = float('inf') + print(" > Starting with inf best loss.") + else: + print(" > Restoring best loss from " + f"{os.path.basename(args.best_path)} ...") + best_loss = torch.load(args.best_path, + map_location='cpu')['model_loss'] + print(f" > Starting with loaded last best loss {best_loss}.") + keep_all_best = c.get('keep_all_best', False) + keep_after = c.get('keep_after', 10000) # void if keep_all_best False # define dataloaders train_loader = setup_loader(ap, 1, is_val=False, verbose=True) eval_loader = setup_loader(ap, 1, is_val=True, verbose=True) global_step = args.restore_step - model = data_depended_init(train_loader, model, ap) + model = data_depended_init(train_loader, model) for epoch in range(0, c.epochs): c_logger.print_epoch_start(epoch, c.epochs) - train_avg_loss_dict, global_step = train(train_loader, model, criterion, optimizer, + train_avg_loss_dict, global_step = train(train_loader, model, + criterion, optimizer, scheduler, ap, global_step, epoch) - eval_avg_loss_dict = evaluate(eval_loader , model, criterion, ap, global_step, epoch) + eval_avg_loss_dict = evaluate(eval_loader, model, criterion, ap, + global_step, epoch) c_logger.print_epoch_end(epoch, eval_avg_loss_dict) target_loss = train_avg_loss_dict['avg_loss'] if c.run_eval: target_loss = eval_avg_loss_dict['avg_loss'] - best_loss = save_best_model(target_loss, best_loss, model, optimizer, global_step, epoch, c.r, - OUT_PATH) + best_loss = save_best_model(target_loss, best_loss, model, optimizer, + global_step, epoch, c.r, OUT_PATH, model_characters, + keep_all_best=keep_all_best, keep_after=keep_after) if __name__ == '__main__': - parser = argparse.ArgumentParser() - parser.add_argument( - 
'--continue_path', - type=str, - help='Training output folder to continue training. Use to continue a training. If it is used, "config_path" is ignored.', - default='', - required='--config_path' not in sys.argv) - parser.add_argument( - '--restore_path', - type=str, - help='Model file to be restored. Use to finetune a model.', - default='') - parser.add_argument( - '--config_path', - type=str, - help='Path to config file for training.', - required='--continue_path' not in sys.argv - ) - parser.add_argument('--debug', - type=bool, - default=False, - help='Do not verify commit integrity to run training.') - - # DISTRUBUTED - parser.add_argument( - '--rank', - type=int, - default=0, - help='DISTRIBUTED: process rank for distributed training.') - parser.add_argument('--group_id', - type=str, - default="", - help='DISTRIBUTED: process group id.') - args = parser.parse_args() - - if args.continue_path != '': - args.output_path = args.continue_path - args.config_path = os.path.join(args.continue_path, 'config.json') - list_of_files = glob.glob(args.continue_path + "/*.pth.tar") # * means all if need specific format then *.csv - latest_model_file = max(list_of_files, key=os.path.getctime) - args.restore_path = latest_model_file - print(f" > Training continues for {args.restore_path}") - - # setup output paths and read configs - c = load_config(args.config_path) - # check_config(c) - check_config_tts(c) - _ = os.path.dirname(os.path.realpath(__file__)) - - if c.mixed_precision: - print(" > Mixed precision enabled.") - - OUT_PATH = args.continue_path - if args.continue_path == '': - OUT_PATH = create_experiment_folder(c.output_path, c.run_name, args.debug) - - AUDIO_PATH = os.path.join(OUT_PATH, 'test_audios') - - c_logger = ConsoleLogger() - - if args.rank == 0: - os.makedirs(AUDIO_PATH, exist_ok=True) - new_fields = {} - if args.restore_path: - new_fields["restore_path"] = args.restore_path - new_fields["github_branch"] = get_git_branch() - copy_model_files(c, args.config_path, - OUT_PATH, new_fields) - os.chmod(AUDIO_PATH, 0o775) - os.chmod(OUT_PATH, 0o775) - - LOG_DIR = OUT_PATH - tb_logger = TensorboardLogger(LOG_DIR, model_name='TTS') - - # write model desc to tensorboard - tb_logger.tb_add_text('model-description', c['run_description'], 0) + args = parse_arguments(sys.argv) + c, OUT_PATH, AUDIO_PATH, c_logger, tb_logger = process_args( + args, model_type='glow_tts') try: main(args) diff --git a/TTS/bin/train_speedy_speech.py b/TTS/bin/train_speedy_speech.py index a24cf8bcd4..a2ac6028de 100644 --- a/TTS/bin/train_speedy_speech.py +++ b/TTS/bin/train_speedy_speech.py @@ -1,8 +1,6 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- -import argparse -import glob import os import sys import time @@ -11,6 +9,7 @@ from random import randrange import torch +from TTS.utils.arguments import parse_arguments, process_args # DISTRIBUTED from torch.nn.parallel import DistributedDataParallel as DDP_th from torch.utils.data import DataLoader @@ -18,7 +17,7 @@ from TTS.tts.datasets.preprocess import load_meta_data from TTS.tts.datasets.TTSDataset import MyDataset from TTS.tts.layers.losses import SpeedySpeechLoss -from TTS.tts.utils.generic_utils import check_config_tts, setup_model +from TTS.tts.utils.generic_utils import setup_model from TTS.tts.utils.io import save_best_model, save_checkpoint from TTS.tts.utils.measures import alignment_diagonal_score from TTS.tts.utils.speakers import parse_speakers @@ -26,14 +25,10 @@ from TTS.tts.utils.text.symbols import make_symbols, phonemes, symbols from 
TTS.tts.utils.visual import plot_alignment, plot_spectrogram from TTS.utils.audio import AudioProcessor -from TTS.utils.console_logger import ConsoleLogger from TTS.utils.distribute import init_distributed, reduce_tensor from TTS.utils.generic_utils import (KeepAverage, count_parameters, - create_experiment_folder, get_git_branch, remove_experiment_folder, set_init_dict) -from TTS.utils.io import copy_model_files, load_config from TTS.utils.radam import RAdam -from TTS.utils.tensorboard_logger import TensorboardLogger from TTS.utils.training import NoamLR, setup_torch_training_env use_cuda, num_gpus = setup_torch_training_env(True, False) @@ -175,13 +170,13 @@ def train(data_loader, model, criterion, optimizer, scheduler, scaler.scale(loss_dict['loss']).backward() scaler.unscale_(optimizer) grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), - c.grad_clip) + c.grad_clip) scaler.step(optimizer) scaler.update() else: loss_dict['loss'].backward() grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), - c.grad_clip) + c.grad_clip) optimizer.step() # setup lr @@ -250,7 +245,7 @@ def train(data_loader, model, criterion, optimizer, scheduler, if global_step % c.save_step == 0: if c.checkpoint: # save model - save_checkpoint(model, optimizer, global_step, epoch, 1, OUT_PATH, + save_checkpoint(model, optimizer, global_step, epoch, 1, OUT_PATH, model_characters, model_loss=loss_dict['loss']) # wait all kernels to be completed @@ -434,7 +429,7 @@ def evaluate(data_loader, model, criterion, ap, global_step, epoch): # FIXME: move args definition/parsing inside of main? def main(args): # pylint: disable=redefined-outer-name # pylint: disable=global-variable-undefined - global meta_data_train, meta_data_eval, symbols, phonemes, speaker_mapping + global meta_data_train, meta_data_eval, symbols, phonemes, model_characters, speaker_mapping # Audio processor ap = AudioProcessor(**c.audio) if 'characters' in c.keys(): @@ -444,7 +439,10 @@ def main(args): # pylint: disable=redefined-outer-name if num_gpus > 1: init_distributed(args.rank, num_gpus, args.group_id, c.distributed["backend"], c.distributed["url"]) - num_chars = len(phonemes) if c.use_phonemes else len(symbols) + + # set model characters + model_characters = phonemes if c.use_phonemes else symbols + num_chars = len(model_characters) # load data instances meta_data_train, meta_data_eval = load_meta_data(c.datasets, eval_split=True) @@ -464,6 +462,7 @@ def main(args): # pylint: disable=redefined-outer-name criterion = SpeedySpeechLoss(c) if args.restore_path: + print(f" > Restoring from {os.path.basename(args.restore_path)} ...") checkpoint = torch.load(args.restore_path, map_location='cpu') try: # TODO: fix optimizer init, model.cuda() needs to be called before @@ -505,8 +504,17 @@ def main(args): # pylint: disable=redefined-outer-name num_params = count_parameters(model) print("\n > Model has {} parameters".format(num_params), flush=True) - if 'best_loss' not in locals(): + if args.restore_step == 0 or not args.best_path: best_loss = float('inf') + print(" > Starting with inf best loss.") + else: + print(" > Restoring best loss from " + f"{os.path.basename(args.best_path)} ...") + best_loss = torch.load(args.best_path, + map_location='cpu')['model_loss'] + print(f" > Starting with loaded last best loss {best_loss}.") + keep_all_best = c.get('keep_all_best', False) + keep_after = c.get('keep_after', 10000) # void if keep_all_best False # define dataloaders train_loader = setup_loader(ap, 1, is_val=False, verbose=True) @@ -518,91 
+526,21 @@ def main(args): # pylint: disable=redefined-outer-name train_avg_loss_dict, global_step = train(train_loader, model, criterion, optimizer, scheduler, ap, global_step, epoch) - eval_avg_loss_dict = evaluate(eval_loader , model, criterion, ap, global_step, epoch) + eval_avg_loss_dict = evaluate(eval_loader, model, criterion, ap, + global_step, epoch) c_logger.print_epoch_end(epoch, eval_avg_loss_dict) target_loss = train_avg_loss_dict['avg_loss'] if c.run_eval: target_loss = eval_avg_loss_dict['avg_loss'] - best_loss = save_best_model(target_loss, best_loss, model, optimizer, global_step, epoch, c.r, - OUT_PATH) + best_loss = save_best_model(target_loss, best_loss, model, optimizer, + global_step, epoch, c.r, OUT_PATH, model_characters, + keep_all_best=keep_all_best, keep_after=keep_after) if __name__ == '__main__': - parser = argparse.ArgumentParser() - parser.add_argument( - '--continue_path', - type=str, - help='Training output folder to continue training. Use to continue a training. If it is used, "config_path" is ignored.', - default='', - required='--config_path' not in sys.argv) - parser.add_argument( - '--restore_path', - type=str, - help='Model file to be restored. Use to finetune a model.', - default='') - parser.add_argument( - '--config_path', - type=str, - help='Path to config file for training.', - required='--continue_path' not in sys.argv - ) - parser.add_argument('--debug', - type=bool, - default=False, - help='Do not verify commit integrity to run training.') - - # DISTRUBUTED - parser.add_argument( - '--rank', - type=int, - default=0, - help='DISTRIBUTED: process rank for distributed training.') - parser.add_argument('--group_id', - type=str, - default="", - help='DISTRIBUTED: process group id.') - args = parser.parse_args() - - if args.continue_path != '': - args.output_path = args.continue_path - args.config_path = os.path.join(args.continue_path, 'config.json') - list_of_files = glob.glob(args.continue_path + "/*.pth.tar") # * means all if need specific format then *.csv - latest_model_file = max(list_of_files, key=os.path.getctime) - args.restore_path = latest_model_file - print(f" > Training continues for {args.restore_path}") - - # setup output paths and read configs - c = load_config(args.config_path) - # check_config(c) - check_config_tts(c) - _ = os.path.dirname(os.path.realpath(__file__)) - - if c.mixed_precision: - print(" > Mixed precision enabled.") - - OUT_PATH = args.continue_path - if args.continue_path == '': - OUT_PATH = create_experiment_folder(c.output_path, c.run_name, args.debug) - - AUDIO_PATH = os.path.join(OUT_PATH, 'test_audios') - - c_logger = ConsoleLogger() - - if args.rank == 0: - os.makedirs(AUDIO_PATH, exist_ok=True) - new_fields = {} - if args.restore_path: - new_fields["restore_path"] = args.restore_path - new_fields["github_branch"] = get_git_branch() - copy_model_files(c, args.config_path, OUT_PATH, new_fields) - os.chmod(AUDIO_PATH, 0o775) - os.chmod(OUT_PATH, 0o775) - - LOG_DIR = OUT_PATH - tb_logger = TensorboardLogger(LOG_DIR, model_name='TTS') - - # write model desc to tensorboard - tb_logger.tb_add_text('model-description', c['run_description'], 0) + args = parse_arguments(sys.argv) + c, OUT_PATH, AUDIO_PATH, c_logger, tb_logger = process_args( + args, model_type='tts') try: main(args) diff --git a/TTS/bin/train_tacotron.py b/TTS/bin/train_tacotron.py index ccb35a7c40..0887c2ccf9 100644 --- a/TTS/bin/train_tacotron.py +++ b/TTS/bin/train_tacotron.py @@ -1,8 +1,6 @@ #!/usr/bin/env python3 -# -*- coding: utf-8 -*- 
+"""Trains Tacotron based TTS models.""" -import argparse -import glob import os import sys import time @@ -12,10 +10,11 @@ import numpy as np import torch from torch.utils.data import DataLoader +from TTS.utils.arguments import parse_arguments, process_args from TTS.tts.datasets.preprocess import load_meta_data from TTS.tts.datasets.TTSDataset import MyDataset from TTS.tts.layers.losses import TacotronLoss -from TTS.tts.utils.generic_utils import check_config_tts, setup_model +from TTS.tts.utils.generic_utils import setup_model from TTS.tts.utils.io import save_best_model, save_checkpoint from TTS.tts.utils.measures import alignment_diagonal_score from TTS.tts.utils.speakers import parse_speakers @@ -23,15 +22,11 @@ from TTS.tts.utils.text.symbols import make_symbols, phonemes, symbols from TTS.tts.utils.visual import plot_alignment, plot_spectrogram from TTS.utils.audio import AudioProcessor -from TTS.utils.console_logger import ConsoleLogger from TTS.utils.distribute import (DistributedSampler, apply_gradient_allreduce, init_distributed, reduce_tensor) from TTS.utils.generic_utils import (KeepAverage, count_parameters, - create_experiment_folder, get_git_branch, remove_experiment_folder, set_init_dict) -from TTS.utils.io import copy_model_files, load_config from TTS.utils.radam import RAdam -from TTS.utils.tensorboard_logger import TensorboardLogger from TTS.utils.training import (NoamLR, adam_weight_decay, check_update, gradual_training_scheduler, set_weight_decay, setup_torch_training_env) @@ -61,7 +56,11 @@ def setup_loader(ap, r, is_val=False, verbose=False, dataset=None): phoneme_language=c.phoneme_language, enable_eos_bos=c.enable_eos_bos_chars, verbose=verbose, - speaker_mapping=speaker_mapping if c.use_speaker_embedding and c.use_external_speaker_embedding_file else None) + speaker_mapping=(speaker_mapping if ( + c.use_speaker_embedding + and c.use_external_speaker_embedding_file + ) else None) + ) if c.use_phonemes and c.compute_input_seq_cache: # precompute phonemes to have a better estimate of sequence lengths. @@ -178,10 +177,10 @@ def train(data_loader, model, criterion, optimizer, optimizer_st, scheduler, # compute loss loss_dict = criterion(postnet_output, decoder_output, mel_input, - linear_input, stop_tokens, stop_targets, - mel_lengths, decoder_backward_output, - alignments, alignment_lengths, alignments_backward, - text_lengths) + linear_input, stop_tokens, stop_targets, + mel_lengths, decoder_backward_output, + alignments, alignment_lengths, + alignments_backward, text_lengths) # check nan loss if torch.isnan(loss_dict['loss']).any(): @@ -199,7 +198,7 @@ def train(data_loader, model, criterion, optimizer, optimizer_st, scheduler, # stopnet optimizer step if c.separate_stopnet: - scaler_st.scale( loss_dict['stopnet_loss']).backward() + scaler_st.scale(loss_dict['stopnet_loss']).backward() scaler.unscale_(optimizer_st) optimizer_st, _ = adam_weight_decay(optimizer_st) grad_norm_st, _ = check_update(model.decoder.stopnet, 1.0) @@ -285,6 +284,7 @@ def train(data_loader, model, criterion, optimizer, optimizer_st, scheduler, save_checkpoint(model, optimizer, global_step, epoch, model.decoder.r, OUT_PATH, optimizer_st=optimizer_st, model_loss=loss_dict['postnet_loss'], + characters=model_characters, scaler=scaler.state_dict() if c.mixed_precision else None) # Diagnostic visualizations @@ -491,12 +491,13 @@ def evaluate(data_loader, model, criterion, ap, global_step, epoch): return keep_avg.avg_values -# FIXME: move args definition/parsing inside of main? 
def main(args): # pylint: disable=redefined-outer-name # pylint: disable=global-variable-undefined - global meta_data_train, meta_data_eval, symbols, phonemes, speaker_mapping + global meta_data_train, meta_data_eval, speaker_mapping, symbols, phonemes, model_characters # Audio processor ap = AudioProcessor(**c.audio) + + # setup custom characters if set in config file. if 'characters' in c.keys(): symbols, phonemes = make_symbols(**c.characters) @@ -505,6 +506,7 @@ def main(args): # pylint: disable=redefined-outer-name init_distributed(args.rank, num_gpus, args.group_id, c.distributed["backend"], c.distributed["url"]) num_chars = len(phonemes) if c.use_phonemes else len(symbols) + model_characters = phonemes if c.use_phonemes else symbols # load data instances meta_data_train, meta_data_eval = load_meta_data(c.datasets) @@ -534,15 +536,15 @@ def main(args): # pylint: disable=redefined-outer-name optimizer_st = None # setup criterion - criterion = TacotronLoss(c, stopnet_pos_weight=10.0, ga_sigma=0.4) - + criterion = TacotronLoss(c, stopnet_pos_weight=c.stopnet_pos_weight, ga_sigma=0.4) if args.restore_path: + print(f" > Restoring from {os.path.basename(args.restore_path)}...") checkpoint = torch.load(args.restore_path, map_location='cpu') try: - print(" > Restoring Model.") + print(" > Restoring Model...") model.load_state_dict(checkpoint['model']) # optimizer restore - print(" > Restoring Optimizer.") + print(" > Restoring Optimizer...") optimizer.load_state_dict(checkpoint['optimizer']) if "scaler" in checkpoint and c.mixed_precision: print(" > Restoring AMP Scaler...") @@ -550,7 +552,7 @@ def main(args): # pylint: disable=redefined-outer-name if c.reinit_layers: raise RuntimeError except (KeyError, RuntimeError): - print(" > Partial model initialization.") + print(" > Partial model initialization...") model_dict = model.state_dict() model_dict = set_init_dict(model_dict, checkpoint['model'], c) # torch.save(model_dict, os.path.join(OUT_PATH, 'state_dict.pt')) @@ -584,8 +586,17 @@ def main(args): # pylint: disable=redefined-outer-name num_params = count_parameters(model) print("\n > Model has {} parameters".format(num_params), flush=True) - if 'best_loss' not in locals(): + if args.restore_step == 0 or not args.best_path: best_loss = float('inf') + print(" > Starting with inf best loss.") + else: + print(" > Restoring best loss from " + f"{os.path.basename(args.best_path)} ...") + best_loss = torch.load(args.best_path, + map_location='cpu')['model_loss'] + print(f" > Starting with loaded last best loss {best_loss}.") + keep_all_best = c.get('keep_all_best', False) + keep_after = c.get('keep_after', 10000) # void if keep_all_best False # define data loaders train_loader = setup_loader(ap, @@ -637,85 +648,17 @@ def main(args): # pylint: disable=redefined-outer-name epoch, c.r, OUT_PATH, - scaler=scaler.state_dict() if c.mixed_precision else None) + model_characters, + keep_all_best=keep_all_best, + keep_after=keep_after, + scaler=scaler.state_dict() if c.mixed_precision else None + ) if __name__ == '__main__': - parser = argparse.ArgumentParser() - parser.add_argument( - '--continue_path', - type=str, - help='Training output folder to continue training. Use to continue a training. If it is used, "config_path" is ignored.', - default='', - required='--config_path' not in sys.argv) - parser.add_argument( - '--restore_path', - type=str, - help='Model file to be restored. 
Use to finetune a model.', - default='') - parser.add_argument( - '--config_path', - type=str, - help='Path to config file for training.', - required='--continue_path' not in sys.argv - ) - parser.add_argument('--debug', - type=bool, - default=False, - help='Do not verify commit integrity to run training.') - - # DISTRUBUTED - parser.add_argument( - '--rank', - type=int, - default=0, - help='DISTRIBUTED: process rank for distributed training.') - parser.add_argument('--group_id', - type=str, - default="", - help='DISTRIBUTED: process group id.') - args = parser.parse_args() - - if args.continue_path != '': - print(f" > Training continues for {args.continue_path}") - args.output_path = args.continue_path - args.config_path = os.path.join(args.continue_path, 'config.json') - list_of_files = glob.glob(args.continue_path + "/*.pth.tar") # * means all if need specific format then *.csv - latest_model_file = max(list_of_files, key=os.path.getctime) - args.restore_path = latest_model_file - - # setup output paths and read configs - c = load_config(args.config_path) - check_config_tts(c) - _ = os.path.dirname(os.path.realpath(__file__)) - - if c.mixed_precision: - print(" > Mixed precision mode is ON") - - OUT_PATH = args.continue_path - if args.continue_path == '': - OUT_PATH = create_experiment_folder(c.output_path, c.run_name, args.debug) - - AUDIO_PATH = os.path.join(OUT_PATH, 'test_audios') - - c_logger = ConsoleLogger() - - if args.rank == 0: - os.makedirs(AUDIO_PATH, exist_ok=True) - new_fields = {} - if args.restore_path: - new_fields["restore_path"] = args.restore_path - new_fields["github_branch"] = get_git_branch() - copy_model_files(c, args.config_path, - OUT_PATH, new_fields) - os.chmod(AUDIO_PATH, 0o775) - os.chmod(OUT_PATH, 0o775) - - LOG_DIR = OUT_PATH - tb_logger = TensorboardLogger(LOG_DIR, model_name='TTS') - - # write model desc to tensorboard - tb_logger.tb_add_text('model-description', c['run_description'], 0) + args = parse_arguments(sys.argv) + c, OUT_PATH, AUDIO_PATH, c_logger, tb_logger = process_args( + args, model_type='tacotron') try: main(args) diff --git a/TTS/bin/train_vocoder_gan.py b/TTS/bin/train_vocoder_gan.py index 5f1e8c636e..a4872361b7 100644 --- a/TTS/bin/train_vocoder_gan.py +++ b/TTS/bin/train_vocoder_gan.py @@ -1,5 +1,6 @@ -import argparse -import glob +#!/usr/bin/env python3 +"""Trains GAN based vocoder model.""" + import os import sys import time @@ -8,14 +9,13 @@ import torch from torch.utils.data import DataLoader +from TTS.utils.arguments import parse_arguments, process_args from TTS.utils.audio import AudioProcessor -from TTS.utils.console_logger import ConsoleLogger from TTS.utils.generic_utils import (KeepAverage, count_parameters, - create_experiment_folder, get_git_branch, remove_experiment_folder, set_init_dict) -from TTS.utils.io import copy_model_files, load_config + from TTS.utils.radam import RAdam -from TTS.utils.tensorboard_logger import TensorboardLogger + from TTS.utils.training import setup_torch_training_env from TTS.vocoder.datasets.gan_dataset import GANDataset from TTS.vocoder.datasets.preprocess import load_wav_data, load_wav_feat_data @@ -33,9 +33,8 @@ def setup_loader(ap, is_val=False, verbose=False): - if is_val and not c.run_eval: - loader = None - else: + loader = None + if not is_val or c.run_eval: dataset = GANDataset(ap=ap, items=eval_data if is_val else train_data, seq_len=c.seq_len, @@ -51,7 +50,7 @@ def setup_loader(ap, is_val=False, verbose=False): sampler = DistributedSampler(dataset, shuffle=True) if num_gpus > 1 
else None loader = DataLoader(dataset, batch_size=1 if is_val else c.batch_size, - shuffle=False if num_gpus > 1 else True, + shuffle=num_gpus == 0, drop_last=False, sampler=sampler, num_workers=c.num_val_loader_workers @@ -274,14 +273,14 @@ def train(model_G, criterion_G, optimizer_G, model_D, criterion_D, optimizer_D, # compute spectrograms figures = plot_results(y_hat_vis, y_G, ap, global_step, - 'train') + 'train') tb_logger.tb_train_figures(global_step, figures) # Sample audio sample_voice = y_hat_vis[0].squeeze(0).detach().cpu().numpy() tb_logger.tb_train_audios(global_step, - {'train/audio': sample_voice}, - c.audio["sample_rate"]) + {'train/audio': sample_voice}, + c.audio["sample_rate"]) end_time = time.time() # print epoch stats @@ -430,17 +429,16 @@ def evaluate(model_G, criterion_G, model_D, criterion_D, ap, global_step, epoch) # Sample audio sample_voice = y_hat[0].squeeze(0).detach().cpu().numpy() tb_logger.tb_eval_audios(global_step, {'eval/audio': sample_voice}, - c.audio["sample_rate"]) + c.audio["sample_rate"]) tb_logger.tb_eval_stats(global_step, keep_avg.avg_values) - # synthesize a full voice + # synthesize a full voice data_loader.return_segments = False return keep_avg.avg_values -# FIXME: move args definition/parsing inside of main? def main(args): # pylint: disable=redefined-outer-name # pylint: disable=global-variable-undefined global train_data, eval_data @@ -487,6 +485,7 @@ def main(args): # pylint: disable=redefined-outer-name criterion_disc = DiscriminatorLoss(c) if args.restore_path: + print(f" > Restoring from {os.path.basename(args.restore_path)}...") checkpoint = torch.load(args.restore_path, map_location='cpu') try: print(" > Restoring Generator Model...") @@ -507,7 +506,7 @@ def main(args): # pylint: disable=redefined-outer-name scheduler_disc.load_state_dict(checkpoint['scheduler_disc']) scheduler_disc.optimizer = optimizer_disc except RuntimeError: - # retore only matching layers. + # restore only matching layers. 
print(" > Partial model initialization...") model_dict = model_gen.state_dict() model_dict = set_init_dict(model_dict, checkpoint['model'], c) @@ -525,7 +524,7 @@ def main(args): # pylint: disable=redefined-outer-name for group in optimizer_disc.param_groups: group['lr'] = c.lr_disc - print(" > Model restored from step %d" % checkpoint['step'], + print(f" > Model restored from step {checkpoint['step']:d}", flush=True) args.restore_step = checkpoint['step'] else: @@ -547,8 +546,17 @@ def main(args): # pylint: disable=redefined-outer-name num_params = count_parameters(model_disc) print(" > Discriminator has {} parameters".format(num_params), flush=True) - if 'best_loss' not in locals(): + if args.restore_step == 0 or not args.best_path: best_loss = float('inf') + print(" > Starting with inf best loss.") + else: + print(" > Restoring best loss from " + f"{os.path.basename(args.best_path)} ...") + best_loss = torch.load(args.best_path, + map_location='cpu')['model_loss'] + print(f" > Starting with best loss of {best_loss}.") + keep_all_best = c.get('keep_all_best', False) + keep_after = c.get('keep_after', 10000) # void if keep_all_best False global_step = args.restore_step for epoch in range(0, c.epochs): @@ -557,7 +565,8 @@ def main(args): # pylint: disable=redefined-outer-name model_disc, criterion_disc, optimizer_disc, scheduler_gen, scheduler_disc, ap, global_step, epoch) - eval_avg_loss_dict = evaluate(model_gen, criterion_gen, model_disc, criterion_disc, ap, + eval_avg_loss_dict = evaluate(model_gen, criterion_gen, model_disc, + criterion_disc, ap, global_step, epoch) c_logger.print_epoch_end(epoch, eval_avg_loss_dict) target_loss = eval_avg_loss_dict[c.target_loss] @@ -572,83 +581,16 @@ def main(args): # pylint: disable=redefined-outer-name global_step, epoch, OUT_PATH, - model_losses=eval_avg_loss_dict) + keep_all_best=keep_all_best, + keep_after=keep_after, + model_losses=eval_avg_loss_dict, + ) if __name__ == '__main__': - parser = argparse.ArgumentParser() - parser.add_argument( - '--continue_path', - type=str, - help='Training output folder to continue training. Use to continue a training. If it is used, "config_path" is ignored.', - default='', - required='--config_path' not in sys.argv) - parser.add_argument( - '--restore_path', - type=str, - help='Model file to be restored. 
Use to finetune a model.', - default='') - parser.add_argument('--config_path', - type=str, - help='Path to config file for training.', - required='--continue_path' not in sys.argv) - parser.add_argument('--debug', - type=bool, - default=False, - help='Do not verify commit integrity to run training.') - - # DISTRUBUTED - parser.add_argument( - '--rank', - type=int, - default=0, - help='DISTRIBUTED: process rank for distributed training.') - parser.add_argument('--group_id', - type=str, - default="", - help='DISTRIBUTED: process group id.') - args = parser.parse_args() - - if args.continue_path != '': - args.output_path = args.continue_path - args.config_path = os.path.join(args.continue_path, 'config.json') - list_of_files = glob.glob( - args.continue_path + - "/*.pth.tar") # * means all if need specific format then *.csv - latest_model_file = max(list_of_files, key=os.path.getctime) - args.restore_path = latest_model_file - print(f" > Training continues for {args.restore_path}") - - # setup output paths and read configs - c = load_config(args.config_path) - # check_config(c) - _ = os.path.dirname(os.path.realpath(__file__)) - - OUT_PATH = args.continue_path - if args.continue_path == '': - OUT_PATH = create_experiment_folder(c.output_path, c.run_name, - args.debug) - - AUDIO_PATH = os.path.join(OUT_PATH, 'test_audios') - - c_logger = ConsoleLogger() - - if args.rank == 0: - os.makedirs(AUDIO_PATH, exist_ok=True) - new_fields = {} - if args.restore_path: - new_fields["restore_path"] = args.restore_path - new_fields["github_branch"] = get_git_branch() - copy_model_files(c, args.config_path, - OUT_PATH, new_fields) - os.chmod(AUDIO_PATH, 0o775) - os.chmod(OUT_PATH, 0o775) - - LOG_DIR = OUT_PATH - tb_logger = TensorboardLogger(LOG_DIR, model_name='VOCODER') - - # write model desc to tensorboard - tb_logger.tb_add_text('model-description', c['run_description'], 0) + args = parse_arguments(sys.argv) + c, OUT_PATH, AUDIO_PATH, c_logger, tb_logger = process_args( + args, model_type='gan') try: main(args) diff --git a/TTS/bin/train_vocoder_wavegrad.py b/TTS/bin/train_vocoder_wavegrad.py index 73802c63f1..51a3150938 100644 --- a/TTS/bin/train_vocoder_wavegrad.py +++ b/TTS/bin/train_vocoder_wavegrad.py @@ -1,5 +1,6 @@ -import argparse -import glob +#!/usr/bin/env python3 +"""Trains WaveGrad vocoder models.""" + import os import sys import time @@ -12,14 +13,11 @@ from torch.optim import Adam from torch.utils.data import DataLoader from torch.utils.data.distributed import DistributedSampler +from TTS.utils.arguments import parse_arguments, process_args from TTS.utils.audio import AudioProcessor -from TTS.utils.console_logger import ConsoleLogger from TTS.utils.distribute import init_distributed from TTS.utils.generic_utils import (KeepAverage, count_parameters, - create_experiment_folder, get_git_branch, remove_experiment_folder, set_init_dict) -from TTS.utils.io import copy_model_files, load_config -from TTS.utils.tensorboard_logger import TensorboardLogger from TTS.utils.training import setup_torch_training_env from TTS.vocoder.datasets.preprocess import load_wav_data, load_wav_feat_data from TTS.vocoder.datasets.wavegrad_dataset import WaveGradDataset @@ -34,16 +32,16 @@ def setup_loader(ap, is_val=False, verbose=False): loader = None else: dataset = WaveGradDataset(ap=ap, - items=eval_data if is_val else train_data, - seq_len=c.seq_len, - hop_len=ap.hop_length, - pad_short=c.pad_short, - conv_pad=c.conv_pad, - is_training=not is_val, - return_segments=True, - use_noise_augment=False, - 
use_cache=c.use_cache, - verbose=verbose) + items=eval_data if is_val else train_data, + seq_len=c.seq_len, + hop_len=ap.hop_length, + pad_short=c.pad_short, + conv_pad=c.conv_pad, + is_training=not is_val, + return_segments=True, + use_noise_augment=False, + use_cache=c.use_cache, + verbose=verbose) sampler = DistributedSampler(dataset) if num_gpus > 1 else None loader = DataLoader(dataset, batch_size=c.batch_size, @@ -79,8 +77,8 @@ def format_test_data(data): return m, x -def train(model, criterion, optimizer, - scheduler, scaler, ap, global_step, epoch): +def train(model, criterion, optimizer, scheduler, scaler, ap, global_step, + epoch): data_loader = setup_loader(ap, is_val=False, verbose=(epoch == 0)) model.train() epoch_time = 0 @@ -94,7 +92,8 @@ def train(model, criterion, optimizer, c_logger.print_train_start() # setup noise schedule noise_schedule = c['train_noise_schedule'] - betas = np.linspace(noise_schedule['min_val'], noise_schedule['max_val'], noise_schedule['num_steps']) + betas = np.linspace(noise_schedule['min_val'], noise_schedule['max_val'], + noise_schedule['num_steps']) if hasattr(model, 'module'): model.module.compute_noise_level(betas) else: @@ -120,7 +119,7 @@ def train(model, criterion, optimizer, # compute losses loss = criterion(noise, noise_hat) - loss_wavegrad_dict = {'wavegrad_loss':loss} + loss_wavegrad_dict = {'wavegrad_loss': loss} # check nan loss if torch.isnan(loss).any(): @@ -133,13 +132,13 @@ def train(model, criterion, optimizer, scaler.scale(loss).backward() scaler.unscale_(optimizer) grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), - c.clip_grad) + c.clip_grad) scaler.step(optimizer) scaler.update() else: loss.backward() grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), - c.clip_grad) + c.clip_grad) optimizer.step() # schedule update @@ -195,17 +194,19 @@ def train(model, criterion, optimizer, if global_step % c.save_step == 0: if c.checkpoint: # save model - save_checkpoint(model, - optimizer, - scheduler, - None, - None, - None, - global_step, - epoch, - OUT_PATH, - model_losses=loss_dict, - scaler=scaler.state_dict() if c.mixed_precision else None) + save_checkpoint( + model, + optimizer, + scheduler, + None, + None, + None, + global_step, + epoch, + OUT_PATH, + model_losses=loss_dict, + scaler=scaler.state_dict() if c.mixed_precision else None + ) end_time = time.time() @@ -252,7 +253,7 @@ def evaluate(model, criterion, ap, global_step, epoch): # compute losses loss = criterion(noise, noise_hat) - loss_wavegrad_dict = {'wavegrad_loss':loss} + loss_wavegrad_dict = {'wavegrad_loss': loss} loss_dict = dict() @@ -284,7 +285,9 @@ def evaluate(model, criterion, ap, global_step, epoch): # setup noise schedule and inference noise_schedule = c['test_noise_schedule'] - betas = np.linspace(noise_schedule['min_val'], noise_schedule['max_val'], noise_schedule['num_steps']) + betas = np.linspace(noise_schedule['min_val'], + noise_schedule['max_val'], + noise_schedule['num_steps']) if hasattr(model, 'module'): model.module.compute_noise_level(betas) # compute voice @@ -315,7 +318,8 @@ def main(args): # pylint: disable=redefined-outer-name print(f" > Loading wavs from: {c.data_path}") if c.feature_path is not None: print(f" > Loading features from: {c.feature_path}") - eval_data, train_data = load_wav_feat_data(c.data_path, c.feature_path, c.eval_split_size) + eval_data, train_data = load_wav_feat_data(c.data_path, c.feature_path, + c.eval_split_size) else: eval_data, train_data = load_wav_data(c.data_path, c.eval_split_size) @@ 
-345,7 +349,12 @@ def main(args): # pylint: disable=redefined-outer-name # setup criterion criterion = torch.nn.L1Loss().cuda() + if use_cuda: + model.cuda() + criterion.cuda() + if args.restore_path: + print(f" > Restoring from {os.path.basename(args.restore_path)}...") checkpoint = torch.load(args.restore_path, map_location='cpu') try: print(" > Restoring Model...") @@ -378,10 +387,6 @@ def main(args): # pylint: disable=redefined-outer-name else: args.restore_step = 0 - if use_cuda: - model.cuda() - criterion.cuda() - # DISTRUBUTED if num_gpus > 1: model = DDP_th(model, device_ids=[args.rank]) @@ -389,113 +394,49 @@ def main(args): # pylint: disable=redefined-outer-name num_params = count_parameters(model) print(" > WaveGrad has {} parameters".format(num_params), flush=True) - if 'best_loss' not in locals(): + if args.restore_step == 0 or not args.best_path: best_loss = float('inf') + print(" > Starting with inf best loss.") + else: + print(" > Restoring best loss from " + f"{os.path.basename(args.best_path)} ...") + best_loss = torch.load(args.best_path, + map_location='cpu')['model_loss'] + print(f" > Starting with loaded last best loss {best_loss}.") + keep_all_best = c.get('keep_all_best', False) + keep_after = c.get('keep_after', 10000) # void if keep_all_best False global_step = args.restore_step for epoch in range(0, c.epochs): c_logger.print_epoch_start(epoch, c.epochs) - _, global_step = train(model, criterion, optimizer, - scheduler, scaler, ap, global_step, - epoch) - eval_avg_loss_dict = evaluate(model, criterion, ap, - global_step, epoch) + _, global_step = train(model, criterion, optimizer, scheduler, scaler, + ap, global_step, epoch) + eval_avg_loss_dict = evaluate(model, criterion, ap, global_step, epoch) c_logger.print_epoch_end(epoch, eval_avg_loss_dict) target_loss = eval_avg_loss_dict[c.target_loss] - best_loss = save_best_model(target_loss, - best_loss, - model, - optimizer, - scheduler, - None, - None, - None, - global_step, - epoch, - OUT_PATH, - model_losses=eval_avg_loss_dict, - scaler=scaler.state_dict() if c.mixed_precision else None) + best_loss = save_best_model( + target_loss, + best_loss, + model, + optimizer, + scheduler, + None, + None, + None, + global_step, + epoch, + OUT_PATH, + keep_all_best=keep_all_best, + keep_after=keep_after, + model_losses=eval_avg_loss_dict, + scaler=scaler.state_dict() if c.mixed_precision else None + ) if __name__ == '__main__': - parser = argparse.ArgumentParser() - parser.add_argument( - '--continue_path', - type=str, - help= - 'Training output folder to continue training. Use to continue a training. If it is used, "config_path" is ignored.', - default='', - required='--config_path' not in sys.argv) - parser.add_argument( - '--restore_path', - type=str, - help='Model file to be restored. 
Use to finetune a model.', - default='') - parser.add_argument('--config_path', - type=str, - help='Path to config file for training.', - required='--continue_path' not in sys.argv) - parser.add_argument('--debug', - type=bool, - default=False, - help='Do not verify commit integrity to run training.') - - # DISTRUBUTED - parser.add_argument( - '--rank', - type=int, - default=0, - help='DISTRIBUTED: process rank for distributed training.') - parser.add_argument('--group_id', - type=str, - default="", - help='DISTRIBUTED: process group id.') - args = parser.parse_args() - - if args.continue_path != '': - args.output_path = args.continue_path - args.config_path = os.path.join(args.continue_path, 'config.json') - list_of_files = glob.glob( - args.continue_path + - "/*.pth.tar") # * means all if need specific format then *.csv - latest_model_file = max(list_of_files, key=os.path.getctime) - args.restore_path = latest_model_file - print(f" > Training continues for {args.restore_path}") - - # setup output paths and read configs - c = load_config(args.config_path) - # check_config(c) - _ = os.path.dirname(os.path.realpath(__file__)) - - # DISTRIBUTED - if c.mixed_precision: - print(" > Mixed precision is enabled") - - OUT_PATH = args.continue_path - if args.continue_path == '': - OUT_PATH = create_experiment_folder(c.output_path, c.run_name, - args.debug) - - AUDIO_PATH = os.path.join(OUT_PATH, 'test_audios') - - c_logger = ConsoleLogger() - - if args.rank == 0: - os.makedirs(AUDIO_PATH, exist_ok=True) - new_fields = {} - if args.restore_path: - new_fields["restore_path"] = args.restore_path - new_fields["github_branch"] = get_git_branch() - copy_model_files(c, args.config_path, - OUT_PATH, new_fields) - os.chmod(AUDIO_PATH, 0o775) - os.chmod(OUT_PATH, 0o775) - - LOG_DIR = OUT_PATH - tb_logger = TensorboardLogger(LOG_DIR, model_name='VOCODER') - - # write model desc to tensorboard - tb_logger.tb_add_text('model-description', c['run_description'], 0) + args = parse_arguments(sys.argv) + c, OUT_PATH, AUDIO_PATH, c_logger, tb_logger = process_args( + args, model_type='wavegrad') try: main(args) diff --git a/TTS/bin/train_vocoder_wavernn.py b/TTS/bin/train_vocoder_wavernn.py index cad357dcda..8e9c6a8b9b 100644 --- a/TTS/bin/train_vocoder_wavernn.py +++ b/TTS/bin/train_vocoder_wavernn.py @@ -1,9 +1,10 @@ -import argparse +#!/usr/bin/env python3 +"""Train WaveRNN vocoder model.""" + import os import sys import traceback import time -import glob import random import torch @@ -11,18 +12,14 @@ # from torch.utils.data.distributed import DistributedSampler +from TTS.utils.arguments import parse_arguments, process_args from TTS.tts.utils.visual import plot_spectrogram from TTS.utils.audio import AudioProcessor from TTS.utils.radam import RAdam -from TTS.utils.io import copy_model_files, load_config from TTS.utils.training import setup_torch_training_env -from TTS.utils.console_logger import ConsoleLogger -from TTS.utils.tensorboard_logger import TensorboardLogger from TTS.utils.generic_utils import ( KeepAverage, count_parameters, - create_experiment_folder, - get_git_branch, remove_experiment_folder, set_init_dict, ) @@ -32,7 +29,7 @@ load_wav_feat_data ) from TTS.vocoder.utils.distribution import discretized_mix_logistic_loss, gaussian_loss -from TTS.vocoder.utils.generic_utils import setup_wavernn +from TTS.vocoder.utils.generic_utils import setup_generator from TTS.vocoder.utils.io import save_best_model, save_checkpoint @@ -181,18 +178,19 @@ def train(model, optimizer, criterion, scheduler, scaler, ap, 
global_step, epoch if global_step % c.save_step == 0: if c.checkpoint: # save model - save_checkpoint(model, - optimizer, - scheduler, - None, - None, - None, - global_step, - epoch, - OUT_PATH, - model_losses=loss_dict, - scaler=scaler.state_dict() if c.mixed_precision else None - ) + save_checkpoint( + model, + optimizer, + scheduler, + None, + None, + None, + global_step, + epoch, + OUT_PATH, + model_losses=loss_dict, + scaler=scaler.state_dict() if c.mixed_precision else None + ) # synthesize a full voice rand_idx = random.randrange(0, len(train_data)) @@ -200,12 +198,14 @@ def train(model, optimizer, criterion, scheduler, scaler, ap, global_step, epoch train_data[rand_idx], (tuple, list)) else train_data[rand_idx][0] wav = ap.load_wav(wav_path) ground_mel = ap.melspectrogram(wav) - sample_wav = model.generate(ground_mel, - c.batched, - c.target_samples, - c.overlap_samples, - use_cuda - ) + ground_mel = torch.FloatTensor(ground_mel) + if use_cuda: + ground_mel = ground_mel.cuda(non_blocking=True) + sample_wav = model.inference(ground_mel, + c.batched, + c.target_samples, + c.overlap_samples, + ) predict_mel = ap.melspectrogram(sample_wav) # compute spectrograms @@ -287,12 +287,14 @@ def evaluate(model, criterion, ap, global_step, epoch): eval_data[rand_idx], (tuple, list)) else eval_data[rand_idx][0] wav = ap.load_wav(wav_path) ground_mel = ap.melspectrogram(wav) - sample_wav = model.generate(ground_mel, - c.batched, - c.target_samples, - c.overlap_samples, - use_cuda - ) + ground_mel = torch.FloatTensor(ground_mel) + if use_cuda: + ground_mel = ground_mel.cuda(non_blocking=True) + sample_wav = model.inference(ground_mel, + c.batched, + c.target_samples, + c.overlap_samples, + ) predict_mel = ap.melspectrogram(sample_wav) # Sample audio @@ -302,9 +304,10 @@ def evaluate(model, criterion, ap, global_step, epoch): ) # compute spectrograms - figures = {"eval/ground_truth": plot_spectrogram(ground_mel.T), - "eval/prediction": plot_spectrogram(predict_mel.T) - } + figures = { + "eval/ground_truth": plot_spectrogram(ground_mel.T), + "eval/prediction": plot_spectrogram(predict_mel.T) + } tb_logger.tb_eval_figures(global_step, figures) tb_logger.tb_eval_stats(global_step, keep_avg.avg_values) @@ -350,7 +353,7 @@ def main(args): # pylint: disable=redefined-outer-name eval_data, train_data = load_wav_data( c.data_path, c.eval_split_size) # setup model - model_wavernn = setup_wavernn(c) + model_wavernn = setup_generator(c) # setup amp scaler scaler = torch.cuda.amp.GradScaler() if c.mixed_precision else None @@ -380,6 +383,7 @@ def main(args): # pylint: disable=redefined-outer-name # restore any checkpoint if args.restore_path: + print(f" > Restoring from {os.path.basename(args.restore_path)}...") checkpoint = torch.load(args.restore_path, map_location="cpu") try: print(" > Restoring Model...") @@ -413,8 +417,17 @@ def main(args): # pylint: disable=redefined-outer-name num_parameters = count_parameters(model_wavernn) print(" > Model has {} parameters".format(num_parameters), flush=True) - if "best_loss" not in locals(): - best_loss = float("inf") + if args.restore_step == 0 or not args.best_path: + best_loss = float('inf') + print(" > Starting with inf best loss.") + else: + print(" > Restoring best loss from " + f"{os.path.basename(args.best_path)} ...") + best_loss = torch.load(args.best_path, + map_location='cpu')['model_loss'] + print(f" > Starting with loaded last best loss {best_loss}.") + keep_all_best = c.get('keep_all_best', False) + keep_after = c.get('keep_after', 10000) # void if 
keep_all_best False global_step = args.restore_step for epoch in range(0, c.epochs): @@ -437,93 +450,17 @@ def main(args): # pylint: disable=redefined-outer-name global_step, epoch, OUT_PATH, + keep_all_best=keep_all_best, + keep_after=keep_after, model_losses=eval_avg_loss_dict, scaler=scaler.state_dict() if c.mixed_precision else None ) if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - "--continue_path", - type=str, - help='Training output folder to continue training. Use to continue a training. If it is used, "config_path" is ignored.', - default="", - required="--config_path" not in sys.argv, - ) - parser.add_argument( - "--restore_path", - type=str, - help="Model file to be restored. Use to finetune a model.", - default="", - ) - parser.add_argument( - "--config_path", - type=str, - help="Path to config file for training.", - required="--continue_path" not in sys.argv, - ) - parser.add_argument( - "--debug", - type=bool, - default=False, - help="Do not verify commit integrity to run training.", - ) - - # DISTRUBUTED - parser.add_argument( - "--rank", - type=int, - default=0, - help="DISTRIBUTED: process rank for distributed training.", - ) - parser.add_argument( - "--group_id", type=str, default="", help="DISTRIBUTED: process group id." - ) - args = parser.parse_args() - - if args.continue_path != "": - args.output_path = args.continue_path - args.config_path = os.path.join(args.continue_path, "config.json") - list_of_files = glob.glob( - args.continue_path + "/*.pth.tar" - ) # * means all if need specific format then *.csv - latest_model_file = max(list_of_files, key=os.path.getctime) - args.restore_path = latest_model_file - print(f" > Training continues for {args.restore_path}") - - # setup output paths and read configs - c = load_config(args.config_path) - # check_config(c) - _ = os.path.dirname(os.path.realpath(__file__)) - - OUT_PATH = args.continue_path - if args.continue_path == "": - OUT_PATH = create_experiment_folder( - c.output_path, c.run_name, args.debug - ) - - AUDIO_PATH = os.path.join(OUT_PATH, "test_audios") - - c_logger = ConsoleLogger() - - if args.rank == 0: - os.makedirs(AUDIO_PATH, exist_ok=True) - new_fields = {} - if args.restore_path: - new_fields["restore_path"] = args.restore_path - new_fields["github_branch"] = get_git_branch() - copy_model_files( - c, args.config_path, OUT_PATH, new_fields - ) - os.chmod(AUDIO_PATH, 0o775) - os.chmod(OUT_PATH, 0o775) - - LOG_DIR = OUT_PATH - tb_logger = TensorboardLogger(LOG_DIR, model_name="VOCODER") - - # write model desc to tensorboard - tb_logger.tb_add_text("model-description", c["run_description"], 0) + args = parse_arguments(sys.argv) + c, OUT_PATH, AUDIO_PATH, c_logger, tb_logger = process_args( + args, model_type='wavernn') try: main(args) diff --git a/TTS/bin/tune_wavegrad.py b/TTS/bin/tune_wavegrad.py index 7461282d26..436a276467 100644 --- a/TTS/bin/tune_wavegrad.py +++ b/TTS/bin/tune_wavegrad.py @@ -87,5 +87,3 @@ best_schedule = {'beta': beta} print(f" > Found a better schedule. - MSE: {mse.item()}") np.save(args.output_path, best_schedule) - - diff --git a/TTS/server/README.md b/TTS/server/README.md index a8d8635a3a..51cedc052d 100644 --- a/TTS/server/README.md +++ b/TTS/server/README.md @@ -1,15 +1,13 @@ -## TTS example web-server + -#### Development server: +# :frog: TTS demo server +Before you use the server, make sure you [install](https://github.com/coqui-ai/TTS/tree/dev#install-tts)) :frog: TTS properly. Then, you can follow the steps below. 
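If you go the pip route, a minimal sketch of the steps (the PyPI package name `TTS` is an assumption here, and `tts-server` — the console entry point referred to in the note further down — is assumed to accept the same flags as `server.py` in the examples below):

```
# install the released package (assuming it is published on PyPI as "TTS")
pip install TTS

# list the released model names, then start the demo server with one of them
# (tts-server is assumed to take the same flags as TTS/server/server.py below)
tts-server --list_models
tts-server --model_name tts_models/en/ljspeech/tacotron2-DCA \
           --vocoder_name vocoder_models/en/ljspeech/multiband-melgan
```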
-##### Using server.py -If you have the environment set already for TTS, then you can directly call ```server.py```. - -**Note:** After installing TTS as a package you can use ```tts-server``` to call the commands below. +**Note:** If you install :frog:TTS using ```pip```, you can also use the ```tts-server``` end point on the terminal. Examples runs: @@ -17,15 +15,15 @@ List officially released models. ```python TTS/server/server.py --list_models ``` Run the server with the official models. -```python TTS/server/server.py --model_name tts_models/en/ljspeech/tacotron2-DCA --vocoder_name vocoder_models/en/ljspeech/mulitband-melgan``` +```python TTS/server/server.py --model_name tts_models/en/ljspeech/tacotron2-DCA --vocoder_name vocoder_models/en/ljspeech/multiband-melgan``` Run the server with the official models on a GPU. -```CUDA_VISIBLE_DEVICES="0" python TTS/server/server.py --model_name tts_models/en/ljspeech/tacotron2-DCA --vocoder_name vocoder_models/en/ljspeech/mulitband-melgan --use_cuda True``` +```CUDA_VISIBLE_DEVICES="0" python TTS/server/server.py --model_name tts_models/en/ljspeech/tacotron2-DCA --vocoder_name vocoder_models/en/ljspeech/multiband-melgan --use_cuda True``` Run the server with a custom models. ```python TTS/server/server.py --tts_checkpoint /path/to/tts/model.pth.tar --tts_config /path/to/tts/config.json --vocoder_checkpoint /path/to/vocoder/model.pth.tar --vocoder_config /path/to/vocoder/config.json``` -##### Using .whl + -#### Running with nginx/uwsgi: + -#### Creating a server package with an embedded model + diff --git a/TTS/server/server.py b/TTS/server/server.py index 1f7357af9e..05960f88de 100644 --- a/TTS/server/server.py +++ b/TTS/server/server.py @@ -17,8 +17,8 @@ def convert_boolean(x): parser = argparse.ArgumentParser() parser.add_argument('--list_models', type=convert_boolean, nargs='?', const=True, default=False, help='list available pre-trained tts and vocoder models.') - parser.add_argument('--model_name', type=str, help='name of one of the released tts models.') - parser.add_argument('--vocoder_name', type=str, help='name of one of the released vocoder models.') + parser.add_argument('--model_name', type=str, default="tts_models/en/ljspeech/speedy-speech-wn", help='name of one of the released tts models.') + parser.add_argument('--vocoder_name', type=str, default=None, help='name of one of the released vocoder models.') parser.add_argument('--tts_checkpoint', type=str, help='path to custom tts checkpoint file') parser.add_argument('--tts_config', type=str, help='path to custom tts config.json file') parser.add_argument('--tts_speakers', type=str, help='path to JSON file containing speaker ids, if speaker ids are used in the model') @@ -30,23 +30,7 @@ def convert_boolean(x): parser.add_argument('--show_details', type=convert_boolean, default=False, help='Generate model detail page.') return parser -synthesizer = None - -embedded_models_folder = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'model') - -embedded_tts_folder = os.path.join(embedded_models_folder, 'tts') -tts_checkpoint_file = os.path.join(embedded_tts_folder, 'checkpoint.pth.tar') -tts_config_file = os.path.join(embedded_tts_folder, 'config.json') - -embedded_vocoder_folder = os.path.join(embedded_models_folder, 'vocoder') -vocoder_checkpoint_file = os.path.join(embedded_vocoder_folder, 'checkpoint.pth.tar') -vocoder_config_file = os.path.join(embedded_vocoder_folder, 'config.json') - -# These models are soon to be deprecated -embedded_wavernn_folder = 
os.path.join(embedded_models_folder, 'wavernn') -wavernn_checkpoint_file = os.path.join(embedded_wavernn_folder, 'checkpoint.pth.tar') -wavernn_config_file = os.path.join(embedded_wavernn_folder, 'config.json') - +# parse the args args = create_argparser().parse_args() path = Path(__file__).parent / "../.models.json" @@ -56,12 +40,13 @@ def convert_boolean(x): manager.list_models() sys.exit() -# set models by the released models +# update in-use models to the specified released models. if args.model_name is not None: - tts_checkpoint_file, tts_config_file = manager.download_model(args.model_name) + tts_checkpoint_file, tts_config_file, tts_json_dict = manager.download_model(args.model_name) + args.vocoder_name = tts_json_dict['default_vocoder'] if args.vocoder_name is None else args.vocoder_name if args.vocoder_name is not None: - vocoder_checkpoint_file, vocoder_config_file = manager.download_model(args.vocoder_name) + vocoder_checkpoint_file, vocoder_config_file, vocoder_json_dict = manager.download_model(args.vocoder_name) # If these were not specified in the CLI args, use default values with embedded model files if not args.tts_checkpoint and os.path.isfile(tts_checkpoint_file): diff --git a/TTS/server/static/TTS_circle.png b/TTS/server/static/TTS_circle.png deleted file mode 100644 index 34755811aa..0000000000 Binary files a/TTS/server/static/TTS_circle.png and /dev/null differ diff --git a/TTS/server/static/coqui-log-green-TTS.png b/TTS/server/static/coqui-log-green-TTS.png new file mode 100644 index 0000000000..6ad188b8c0 Binary files /dev/null and b/TTS/server/static/coqui-log-green-TTS.png differ diff --git a/TTS/server/templates/index.html b/TTS/server/templates/index.html index 8c3c631de1..635db8447b 100644 --- a/TTS/server/templates/index.html +++ b/TTS/server/templates/index.html @@ -5,8 +5,8 @@ - - + + TTS engine @@ -29,13 +29,13 @@ - Fork me on GitHub + Fork me on GitHub