diff --git a/examples/tutorials/tacotron2_pipeline_tutorial.py b/examples/tutorials/tacotron2_pipeline_tutorial.py
index 00687166e9..0117a04e0b 100644
--- a/examples/tutorials/tacotron2_pipeline_tutorial.py
+++ b/examples/tutorials/tacotron2_pipeline_tutorial.py
@@ -23,13 +23,13 @@
 #
 # 2. Spectrogram generation
 #
-# From the encoded text, a spectrogram is generated. We use ``Tacotron2``
+# From the encoded text, a spectrogram is generated. We use the ``Tacotron2``
 # model for this.
 #
 # 3. Time-domain conversion
 #
 # The last step is converting the spectrogram into the waveform. The
-# process to generate speech from spectrogram is also called Vocoder.
+# component that generates speech from the spectrogram is called a vocoder.
 # In this tutorial, three different vocoders are used,
 # :py:class:`~torchaudio.models.WaveRNN`,
 # :py:class:`~torchaudio.transforms.GriffinLim`, and
@@ -90,17 +90,13 @@
 # works.
 #
 # Since the pre-trained Tacotron2 model expects specific set of symbol
-# tables, the same functionalities available in ``torchaudio``. This
-# section is more for the explanation of the basis of encoding.
+# tables, the same functionality is available in ``torchaudio``. However,
+# we will first manually implement the encoding to aid understanding.
 #
-# Firstly, we define the set of symbols. For example, we can use
+# First, we define the set of symbols
 # ``'_-!\'(),.:;? abcdefghijklmnopqrstuvwxyz'``. Then, we will map the
 # each character of the input text into the index of the corresponding
-# symbol in the table.
-#
-# The following is an example of such processing. In the example, symbols
-# that are not in the table are ignored.
-#
+# symbol in the table. Symbols that are not in the table are ignored.

 symbols = "_-!'(),.:;? abcdefghijklmnopqrstuvwxyz"
 look_up = {s: i for i, s in enumerate(symbols)}
@@ -118,8 +114,8 @@ def text_to_sequence(text):

 ######################################################################
 # As mentioned in the above, the symbol table and indices must match
-# what the pretrained Tacotron2 model expects. ``torchaudio`` provides the
-# transform along with the pretrained model. For example, you can
+# what the pretrained Tacotron2 model expects. ``torchaudio`` provides the same
+# transform along with the pretrained model. You can
 # instantiate and use such transform as follow.
 #

@@ -133,12 +129,15 @@ def text_to_sequence(text):

 ######################################################################
-# The ``processor`` object takes either a text or list of texts as inputs.
+# Note: the output of our manual encoding matches the output of the
+# ``torchaudio`` ``processor`` (meaning we correctly re-implemented what
+# the library does internally). The processor takes either a text or a
+# list of texts as inputs.
 # When a list of texts are provided, the returned ``lengths`` variable
 # represents the valid length of each processed tokens in the output
 # batch.
 #
-# The intermediate representation can be retrieved as follow.
+# The intermediate representation can be retrieved as follows:
 #

 print([processor.tokens[i] for i in processed[0, : lengths[0]]])

@@ -152,7 +148,18 @@ def text_to_sequence(text):
 # uses a symbol table based on phonemes and a G2P (Grapheme-to-Phoneme)
 # model.
 #
-# The detail of the G2P model is out of scope of this tutorial, we will
+# The details of the G2P model are beyond the scope of this tutorial; we will
 # just look at what the conversion looks like.
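+#
+# As a rough sketch (assuming the phoneme-based
+# ``TACOTRON2_WAVERNN_PHONE_LJSPEECH`` bundle from ``torchaudio.pipelines``;
+# fetching its text processor may download the G2P model weights):
+#
+# .. code-block:: python
+#
+#    bundle = torchaudio.pipelines.TACOTRON2_WAVERNN_PHONE_LJSPEECH
+#    processor = bundle.get_text_processor()
+#    processed, lengths = processor("Hello world!")
+#    # tokens are phonemes such as 'HH' and 'AH0', not characters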
 #
 # Similar to the case of character-based encoding, the encoding process is
@@ -195,7 +191,7 @@ def text_to_sequence(text):
 # encoded text. For the detail of the model, please refer to `the
 # paper `__.
 #
-# It is easy to instantiate a Tacotron2 model with pretrained weight,
+# It is easy to instantiate a Tacotron2 model with pretrained weights,
 # however, note that the input to Tacotron2 models need to be processed
 # by the matching text processor.
 #
@@ -224,7 +220,7 @@ def text_to_sequence(text):

 ######################################################################
 # Note that ``Tacotron2.infer`` method perfoms multinomial sampling,
-# therefor, the process of generating the spectrogram incurs randomness.
+# therefore, the process of generating the spectrogram incurs randomness.
 #

@@ -245,7 +241,7 @@ def plot():
 # -------------------
 #
 # Once the spectrogram is generated, the last process is to recover the
-# waveform from the spectrogram.
+# waveform from the spectrogram using a vocoder.
 #
 # ``torchaudio`` provides vocoders based on ``GriffinLim`` and
 # ``WaveRNN``.
@@ -253,8 +249,8 @@ def plot():

 ######################################################################
-# WaveRNN
-# ~~~~~~~
+# WaveRNN Vocoder
+# ~~~~~~~~~~~~~~~
 #
 # Continuing from the previous section, we can instantiate the matching
 # WaveRNN model from the same bundle.

@@ -294,11 +290,11 @@ def plot(waveforms, spec, sample_rate):

 ######################################################################
-# Griffin-Lim
-# ~~~~~~~~~~~
+# Griffin-Lim Vocoder
+# ~~~~~~~~~~~~~~~~~~~
 #
 # Using the Griffin-Lim vocoder is same as WaveRNN. You can instantiate
-# the vocode object with
+# the vocoder object with
 # :py:func:`~torchaudio.pipelines.Tacotron2TTSBundle.get_vocoder`
 # method and pass the spectrogram.
 #

@@ -323,8 +319,8 @@ def plot(waveforms, spec, sample_rate):

 ######################################################################
-# Waveglow
-# ~~~~~~~~
+# Waveglow Vocoder
+# ~~~~~~~~~~~~~~~~
 #
 # Waveglow is a vocoder published by Nvidia. The pretrained weights are
 # published on Torch Hub. One can instantiate the model using ``torch.hub``
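+#
+# A minimal sketch of that instantiation (assuming network access and
+# NVIDIA's published Torch Hub entry point; the exact arguments may
+# differ from the code used in this tutorial):
+#
+# .. code-block:: python
+#
+#    import torch
+#
+#    waveglow = torch.hub.load(
+#        "NVIDIA/DeepLearningExamples:torchhub",
+#        "nvidia_waveglow",
+#        model_math="fp32",
+#    )
+#    waveglow.eval()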