
Update this branch #786

Merged · 103 commits · Dec 13, 2020

Commits
f0f8f86
Merge pull request #1 from NVIDIA/master
byshiue Jun 30, 2020
1e21e53
[FT] 1. Push the FasterTransformer v2.1
byshiue Jun 30, 2020
d8cc4a9
remove pretrained aligns and update readme accordingly.
andabi Aug 3, 2020
1aa6813
[FT] 1. Fix the bug of TensorRT plugin of FasterTransformer encoder. …
byshiue Aug 6, 2020
36ad5fe
Update .gitmodules
nv-kkudrynski Aug 6, 2020
769843e
Merge pull request #11 from NVIDIA/master
swethmandava Aug 10, 2020
b82c372
triton v2 api, download mrpc fix, update for mpi 4.2
Aug 10, 2020
efd6384
pointing to wikiextractor commit
Aug 10, 2020
e8f87ac
Keep wikiextractor version fixed
sharathts Aug 10, 2020
1069a73
converge to pyt
Aug 10, 2020
c8bbdb5
Merge pull request #644 from swethmandava/master
swethmandava Aug 11, 2020
41a0891
Merge pull request #645 from NVIDIA/sharathts-patch-4
nv-kkudrynski Aug 11, 2020
fb40734
Remove autobench scripts (#647)
pribalta Aug 12, 2020
9d4c9f3
tfrecords with correct name
Aug 13, 2020
7c0afee
Merge pull request #648 from swethmandava/master
swethmandava Aug 13, 2020
88864b9
[BERT/PyT] MRPC and SST-2 support
nv-kkudrynski Aug 14, 2020
ff7e38b
Merge pull request #650 from NVIDIA/bert_pyt_mrpc
nv-kkudrynski Aug 14, 2020
3745b49
[DLRM/PyT] Update
nv-kkudrynski Aug 17, 2020
d875531
Merge pull request #654 from NVIDIA/dlrm_update
nv-kkudrynski Aug 17, 2020
0d15a95
[DLRM/PyT] Readme fixes
nv-kkudrynski Aug 18, 2020
bbbc823
Merge pull request #655 from NVIDIA/gh/release
nv-kkudrynski Aug 18, 2020
446c878
[ELECTRA/TF2] Update inference latency (#657)
sharathts Aug 20, 2020
8bd6dd1
Document synthetic dataset options
hXl3s Aug 20, 2020
0e6cfbd
Merge pull request #659 from hXl3s/RN50/readme-update
nv-kkudrynski Aug 20, 2020
5cc03ca
[BERT/PyT] Update pretrained checkpoint links (#660)
sharathts Aug 21, 2020
8588e98
[BERT/PyT] specify GPU for triton (#666)
sharathts Sep 2, 2020
21fcdd6
[DLRM/PyT] Triton updates
nv-kkudrynski Sep 7, 2020
323005c
Merge pull request #676 from NVIDIA/gh/release
nv-kkudrynski Sep 8, 2020
5d36b4f
Fixing hyperlinks
nv-kkudrynski Sep 8, 2020
7a4c425
[BERT/PyT] Fix dataloader typo
Sep 9, 2020
cf54b78
fixed link
nv-kkudrynski Sep 10, 2020
1402e94
Update CUDA-Optimized/FastSpeech/README.md
nv-kkudrynski Sep 11, 2020
49e387c
Merge pull request #633 from andabi/master
nv-kkudrynski Sep 11, 2020
152d0c0
Merge pull request #684 from gpauloski/bert_pytorch_fix
nv-kkudrynski Sep 11, 2020
6b82d3a
[TXL/PyT] Minor update for PyTorch Transformer-XL (#688)
szmigacz Sep 14, 2020
437b950
Fixed links in readme
nv-kkudrynski Sep 14, 2020
482fe9a
[BERT/PyT] fix onnx export (#689)
sharathts Sep 15, 2020
aacbda6
Update Jasper sample to TensorRT 7.1.3.4 (#687)
rajeevsrao Sep 15, 2020
a74236a
[BERT/PyT] remove redundant section (#690)
sharathts Sep 17, 2020
751bca1
Update README.md
mmarcinkiewicz Sep 18, 2020
72f40b8
Fixed distributed checkpoint loading
hXl3s Sep 18, 2020
94518be
Merge pull request #693 from hXl3s/RN50/ngc-checkpoint-update
nv-kkudrynski Sep 18, 2020
66d1891
Merge branch 'master' into master
byshiue Sep 20, 2020
b2e89e6
[FT] FasterTransformer 3.0 Release (#696)
byshiue Sep 23, 2020
421c839
fix div
KsenijaS Sep 25, 2020
d057bab
[FastPitch/PyT] Updating for 20.08
nv-kkudrynski Sep 30, 2020
0b27e35
Merge pull request #708 from NVIDIA/gh/release
nv-kkudrynski Sep 30, 2020
550123f
updated convai
grzegorz-k-karch Sep 7, 2020
385d81e
a few fixes
grzegorz-k-karch Oct 7, 2020
b1ce24a
Merge pull request #677 from GrzegorzKarchNV/convai-update
nv-kkudrynski Oct 7, 2020
e76b900
Merge pull request #692 from NVIDIA/unetmed_add_nccl_to_known_issues
nv-kkudrynski Oct 7, 2020
70f247f
Merge pull request #700 from KsenijaS/fix_div
Oct 7, 2020
f217ab1
[COnvNets/Pyt] Triton Deployment
nv-kkudrynski Oct 14, 2020
ac05902
Merge pull request #714 from NVIDIA/gh/release
nv-kkudrynski Oct 14, 2020
3cf3a5c
commit on 'master'
alvarognvidia Oct 15, 2020
0a120b2
Merge pull request #715 from alvarognvidia/master
nv-kkudrynski Oct 15, 2020
9061083
[ConvNets/Pyt] Pretrained weights usage guidelines
nv-kkudrynski Oct 21, 2020
d1b1854
Merge pull request #718 from NVIDIA/gh/release
nv-kkudrynski Oct 21, 2020
b2c72b2
[FastPitch/PyT] Adding notebooks
nv-kkudrynski Oct 21, 2020
8cbac00
Merge pull request #719 from NVIDIA/gh/release
nv-kkudrynski Oct 21, 2020
533f744
[TXL/PyT] Fixed issue with AMP training together with gradient accumu…
szmigacz Oct 23, 2020
96e1700
[ELECTRA/TF2] Pretraining and other updates
nv-kkudrynski Oct 26, 2020
e159774
Merge pull request #722 from NVIDIA/gh/release
nv-kkudrynski Oct 26, 2020
36a6985
[BERT/TF] TRT int8 and Triton
nv-kkudrynski Oct 29, 2020
03c5a9f
Merge pull request #728 from NVIDIA/gh/release
nv-kkudrynski Oct 29, 2020
bec8259
[FastPitch/PyT] updated checkpoints, multispeaker and text processing
nv-kkudrynski Oct 30, 2020
475256f
Merge pull request #731 from NVIDIA/gh/release
nv-kkudrynski Oct 30, 2020
2af0f03
Update README.md
alvarognvidia Nov 2, 2020
fd32b99
[CUDA-Optimized/FastSpeech]
Nov 2, 2020
81e8636
Add --gpus flag to docker run
pribalta Nov 2, 2020
6f20c08
Merge pull request #735 from NVIDIA/pribalta-fix-unetind-readme
nv-kkudrynski Nov 2, 2020
b2e7f4a
[ConvNets/TF] Performance fix
nv-kkudrynski Nov 3, 2020
308925e
Merge pull request #737 from NVIDIA/gh/release
nv-kkudrynski Nov 3, 2020
1d4211f
Merge pull request #12 from NVIDIA/master
swethmandava Nov 3, 2020
c72196b
fix copying perf numbers mistake
Nov 3, 2020
b5741a9
Merge pull request #733 from alvarognvidia/master
nv-kkudrynski Nov 4, 2020
799660f
[Kaldi] Adding Jupyter notebook
nv-kkudrynski Nov 4, 2020
64ea93d
Merge pull request #734 from andabi/master
nv-kkudrynski Nov 4, 2020
2749a80
Merge branch 'gh/master' into gh/release
nv-kkudrynski Nov 5, 2020
ff86473
[FastPitch/PyT] Fixed ckpt handling
nv-kkudrynski Nov 5, 2020
0b34777
Merge pull request #740 from NVIDIA/gh/release
nv-kkudrynski Nov 5, 2020
a095658
Fix: Fix the bugs of allocating workspace (#746,#747)
byshiue Nov 9, 2020
c9846ca
Fix merge issues
hXl3s Nov 9, 2020
0b455ff
Merge pull request #748 from hXl3s/RN50/argparse_fix
nv-kkudrynski Nov 9, 2020
002bcd8
update gluon version to with bert in readme
swethmandava Nov 9, 2020
5ec39fc
Merge pull request #739 from swethmandava/master
swethmandava Nov 9, 2020
1113674
remove links to old ngc checkpoints
swethmandava Nov 9, 2020
7e85343
Merge pull request #749 from swethmandava/master
swethmandava Nov 9, 2020
fa1ddc9
[WideAndDeep/TF] library version fix
nv-kkudrynski Nov 10, 2020
3ddcba4
Merge pull request #751 from NVIDIA/gh/release
nv-kkudrynski Nov 10, 2020
4ef867a
Update a link to spark 3.0.0.
mkfilipiuk Nov 10, 2020
ad49eae
Merge pull request #752 from mkfilipiuk/patch-2
nv-kkudrynski Nov 10, 2020
cca4828
Fixed mrcnn weights downloading script
jan-golda Nov 18, 2020
61d9adc
Merge pull request #755 from jan-golda/mrcnn/fix_weights
nv-kkudrynski Nov 18, 2020
4a64c5b
Fix broken link to Spark pre-processing
jconwayNV Nov 19, 2020
9a6c524
fixing rng_state for backward compatibility
grzegorz-k-karch Nov 19, 2020
d17b10e
Merge pull request #759 from GrzegorzKarchNV/fix_rng_state
Nov 19, 2020
94a8f28
[UNet medical/TF2] Fix
nv-kkudrynski Nov 23, 2020
f3c6bdf
Merge pull request #764 from NVIDIA/gh/release
nv-kkudrynski Nov 23, 2020
478d565
[WideAndDeep/TF] Update for 20.10
nv-kkudrynski Nov 26, 2020
66667f1
Merge pull request #769 from NVIDIA/gh/release
nv-kkudrynski Nov 26, 2020
33ea90e
removie trt, fix queuing delay typo in triton readme for bert
Dec 3, 2020
99b1c89
Merge pull request #773 from swethmandava/master
swethmandava Dec 3, 2020
[FastPitch/PyT] updated checkpoints, multispeaker and text processing
nv-kkudrynski committed Oct 30, 2020
commit bec82593f59a4d60dba602c221d9ce13fac8c1e0
17 changes: 12 additions & 5 deletions PyTorch/SpeechSynthesis/FastPitch/.gitignore
@@ -1,8 +1,15 @@
-*.swp
-*.swo
-*.pyc
-__pycache__
-scripts_joc/
 runs*/
 LJSpeech-1.1/
 output*
+scripts_joc/
+tests/
+
+*.pyc
+__pycache__
+
+.idea/
+.DS_Store
+
+*.swp
+*.swo
+*.swn
28 changes: 16 additions & 12 deletions PyTorch/SpeechSynthesis/FastPitch/README.md
@@ -488,11 +488,11 @@ The `scripts/train.sh` script is configured for 8x GPU with at least 16GB of mem
 ```
 In a single accumulated step, there are `batch_size x gradient_accumulation_steps x GPUs = 256` examples being processed in parallel. With a smaller number of GPUs, increase `--gradient_accumulation_steps` to keep this relation satisfied, e.g., through env variables
 ```bash
-NGPU=4 GRAD_ACC=2 bash scripts/train.sh
+NUM_GPUS=4 GRAD_ACCUMULATION=2 bash scripts/train.sh
 ```
 With automatic mixed precision (AMP), a larger batch size fits in 16GB of memory:
 ```bash
-NGPU=4 GRAD_ACC=1 BS=64 AMP=true bash scripts/train.sh
+NUM_GPUS=4 GRAD_ACCUMULATION=1 BS=64 AMP=true bash scripts/train.sh
 ```
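The invariant in the hunk above — `batch_size x gradient_accumulation_steps x GPUs = 256` — can be checked with a small helper. This sketch is illustrative only: the helper name and divisibility check are not part of the repo, and the arguments merely mirror the script's `NUM_GPUS`, `BS`, and `GRAD_ACCUMULATION` env variables.

```python
# Illustrative helper (not part of the repo): pick the gradient
# accumulation factor that keeps the global batch at 256 examples.
GLOBAL_BATCH = 256

def grad_accumulation(num_gpus, batch_size, global_batch=GLOBAL_BATCH):
    """Return the GRAD_ACCUMULATION value for a given GPU count and batch size."""
    per_step = num_gpus * batch_size  # examples processed per optimizer sub-step
    if global_batch % per_step != 0:
        raise ValueError("global batch not divisible by num_gpus * batch_size")
    return global_batch // per_step

print(grad_accumulation(num_gpus=4, batch_size=32))   # 2, matching the example above
print(grad_accumulation(num_gpus=8, batch_size=32))   # 1
```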

### Inference process
@@ -545,18 +545,18 @@ To benchmark the training performance on a specific batch size, run:

 * NVIDIA DGX A100 (8x A100 40GB)
 ```bash
-AMP=true NGPU=1 BS=128 GRAD_ACC=2 EPOCHS=10 bash scripts/train.sh
-AMP=true NGPU=8 BS=32 GRAD_ACC=1 EPOCHS=10 bash scripts/train.sh
-NGPU=1 BS=128 GRAD_ACC=2 EPOCHS=10 bash scripts/train.sh
-NGPU=8 BS=32 GRAD_ACC=1 EPOCHS=10 bash scripts/train.sh
+AMP=true NUM_GPUS=1 BS=128 GRAD_ACCUMULATION=2 EPOCHS=10 bash scripts/train.sh
+AMP=true NUM_GPUS=8 BS=32 GRAD_ACCUMULATION=1 EPOCHS=10 bash scripts/train.sh
+NUM_GPUS=1 BS=128 GRAD_ACCUMULATION=2 EPOCHS=10 bash scripts/train.sh
+NUM_GPUS=8 BS=32 GRAD_ACCUMULATION=1 EPOCHS=10 bash scripts/train.sh
 ```

 * NVIDIA DGX-1 (8x V100 16GB)
 ```bash
-AMP=true NGPU=1 BS=64 GRAD_ACC=4 EPOCHS=10 bash scripts/train.sh
-AMP=true NGPU=8 BS=32 GRAD_ACC=1 EPOCHS=10 bash scripts/train.sh
-NGPU=1 BS=32 GRAD_ACC=8 EPOCHS=10 bash scripts/train.sh
-NGPU=8 BS=32 GRAD_ACC=1 EPOCHS=10 bash scripts/train.sh
+AMP=true NUM_GPUS=1 BS=64 GRAD_ACCUMULATION=4 EPOCHS=10 bash scripts/train.sh
+AMP=true NUM_GPUS=8 BS=32 GRAD_ACCUMULATION=1 EPOCHS=10 bash scripts/train.sh
+NUM_GPUS=1 BS=32 GRAD_ACCUMULATION=8 EPOCHS=10 bash scripts/train.sh
+NUM_GPUS=8 BS=32 GRAD_ACCUMULATION=1 EPOCHS=10 bash scripts/train.sh
 ```

Each of these scripts runs for 10 epochs and for each epoch measures the
@@ -569,12 +569,12 @@ To benchmark the inference performance on a specific batch size, run:

 * For FP16
 ```bash
-AMP=true BS_SEQ=”1 4 8” REPEATS=100 bash scripts/inference_benchmark.sh
+AMP=true BS_SEQUENCE=”1 4 8” REPEATS=100 bash scripts/inference_benchmark.sh
 ```

 * For FP32 or TF32
 ```bash
-BS_SEQ=”1 4 8” REPEATS=100 bash scripts/inference_benchmark.sh
+BS_SEQUENCE=”1 4 8” REPEATS=100 bash scripts/inference_benchmark.sh
 ```

The output log files will contain performance numbers for the FastPitch model
@@ -726,6 +726,10 @@ The input utterance has 128 characters, synthesized audio has 8.05 s.

 ### Changelog

+October 2020
+- Added multispeaker capabilities
+- Updated text processing module
+
 June 2020
 - Updated performance tables to include A100 results

75 changes: 2 additions & 73 deletions PyTorch/SpeechSynthesis/FastPitch/common/text/__init__.py
@@ -1,74 +1,3 @@
 """ from https://github.com/keithito/tacotron """
-import re
-from common.text import cleaners
-from common.text.symbols import symbols
+from .cmudict import CMUDict
-
-
-# Mappings from symbol to numeric ID and vice versa:
-_symbol_to_id = {s: i for i, s in enumerate(symbols)}
-_id_to_symbol = {i: s for i, s in enumerate(symbols)}
-
-# Regular expression matching text enclosed in curly braces:
-_curly_re = re.compile(r'(.*?)\{(.+?)\}(.*)')
-
-
-def text_to_sequence(text, cleaner_names):
-    '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
-
-    The text can optionally have ARPAbet sequences enclosed in curly braces embedded
-    in it. For example, "Turn left on {HH AW1 S S T AH0 N} Street."
-
-    Args:
-      text: string to convert to a sequence
-      cleaner_names: names of the cleaner functions to run the text through
-
-    Returns:
-      List of integers corresponding to the symbols in the text
-    '''
-    sequence = []
-
-    # Check for curly braces and treat their contents as ARPAbet:
-    while len(text):
-        m = _curly_re.match(text)
-        if not m:
-            sequence += _symbols_to_sequence(_clean_text(text, cleaner_names))
-            break
-        sequence += _symbols_to_sequence(_clean_text(m.group(1), cleaner_names))
-        sequence += _arpabet_to_sequence(m.group(2))
-        text = m.group(3)
-
-    return sequence
-
-
-def sequence_to_text(sequence):
-    '''Converts a sequence of IDs back to a string'''
-    result = ''
-    for symbol_id in sequence:
-        if symbol_id in _id_to_symbol:
-            s = _id_to_symbol[symbol_id]
-            # Enclose ARPAbet back in curly braces:
-            if len(s) > 1 and s[0] == '@':
-                s = '{%s}' % s[1:]
-            result += s
-    return result.replace('}{', ' ')
-
-
-def _clean_text(text, cleaner_names):
-    for name in cleaner_names:
-        cleaner = getattr(cleaners, name)
-        if not cleaner:
-            raise Exception('Unknown cleaner: %s' % name)
-        text = cleaner(text)
-    return text
-
-
-def _symbols_to_sequence(symbols):
-    return [_symbol_to_id[s] for s in symbols if _should_keep_symbol(s)]
-
-
-def _arpabet_to_sequence(text):
-    return _symbols_to_sequence(['@' + s for s in text.split()])
-
-
-def _should_keep_symbol(s):
-    return s in _symbol_to_id and s is not '_' and s is not '~'
+cmudict = CMUDict()
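The removed `text_to_sequence` above treats `{...}` spans as ARPAbet transcriptions. A minimal, self-contained sketch of just that parsing loop (no symbol table or cleaners; `split_arpabet` is a hypothetical name for illustration):

```python
import re

# Same curly-brace pattern as the removed module.
_curly_re = re.compile(r'(.*?)\{(.+?)\}(.*)')

def split_arpabet(text):
    """Split text into (kind, chunk) pieces; curly-brace spans become 'arpabet'."""
    pieces = []
    while len(text):
        m = _curly_re.match(text)
        if not m:
            pieces.append(('text', text))
            break
        if m.group(1):
            pieces.append(('text', m.group(1)))
        pieces.append(('arpabet', m.group(2)))
        text = m.group(3)  # continue on the remainder after the closing brace
    return pieces

# [('text', 'Turn left on '), ('arpabet', 'HH AW1 S S T AH0 N'), ('text', ' Street.')]
print(split_arpabet("Turn left on {HH AW1 S S T AH0 N} Street."))
```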
58 changes: 58 additions & 0 deletions PyTorch/SpeechSynthesis/FastPitch/common/text/abbreviations.py
@@ -0,0 +1,58 @@
import re

_no_period_re = re.compile(r'(No[.])(?=[ ]?[0-9])')
_percent_re = re.compile(r'([ ]?[%])')
_half_re = re.compile('([0-9]½)|(½)')


# List of (regular expression, replacement) pairs for abbreviations:
_abbreviations = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1]) for x in [
    ('mrs', 'misess'),
    ('ms', 'miss'),
    ('mr', 'mister'),
    ('dr', 'doctor'),
    ('st', 'saint'),
    ('co', 'company'),
    ('jr', 'junior'),
    ('maj', 'major'),
    ('gen', 'general'),
    ('drs', 'doctors'),
    ('rev', 'reverend'),
    ('lt', 'lieutenant'),
    ('hon', 'honorable'),
    ('sgt', 'sergeant'),
    ('capt', 'captain'),
    ('esq', 'esquire'),
    ('ltd', 'limited'),
    ('col', 'colonel'),
    ('ft', 'fort'),
    ('sen', 'senator'),
]]


def _expand_no_period(m):
    word = m.group(0)
    if word[0] == 'N':
        return 'Number'
    return 'number'


def _expand_percent(m):
    return ' percent'


def _expand_half(m):
    word = m.group(1)
    if word is None:
        return 'half'
    return word[0] + ' and a half'


def normalize_abbreviations(text):
    text = re.sub(_no_period_re, _expand_no_period, text)
    text = re.sub(_percent_re, _expand_percent, text)
    text = re.sub(_half_re, _expand_half, text)

    for regex, replacement in _abbreviations:
        text = re.sub(regex, replacement, text)
    return text
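As a rough usage sketch of `normalize_abbreviations`, the snippet below reproduces a few of the rules above in a self-contained form (the abbreviation list is trimmed to two entries, and `normalize` is a hypothetical stand-in, not the module's API):

```python
import re

# Self-contained copy of a subset of the rules from abbreviations.py above.
_no_period_re = re.compile(r'(No[.])(?=[ ]?[0-9])')
_percent_re = re.compile(r'([ ]?[%])')
_half_re = re.compile('([0-9]½)|(½)')
_abbreviations = [(re.compile('\\b%s\\.' % ab, re.IGNORECASE), full)
                  for ab, full in [('dr', 'doctor'), ('st', 'saint')]]

def normalize(text):
    # "No. 5" -> "Number 5" (case of the leading letter is preserved)
    text = _no_period_re.sub(
        lambda m: 'Number' if m.group(0)[0] == 'N' else 'number', text)
    # "50%" -> "50 percent" (an optional preceding space is consumed)
    text = _percent_re.sub(' percent', text)
    # "2½" -> "2 and a half", bare "½" -> "half"
    text = _half_re.sub(
        lambda m: 'half' if m.group(1) is None else m.group(1)[0] + ' and a half',
        text)
    for regex, replacement in _abbreviations:
        text = regex.sub(replacement, text)
    return text

print(normalize('Dr. Smith paid 50% at No. 5'))  # doctor Smith paid 50 percent at Number 5
```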
67 changes: 67 additions & 0 deletions PyTorch/SpeechSynthesis/FastPitch/common/text/acronyms.py
@@ -0,0 +1,67 @@
import re
from . import cmudict

_letter_to_arpabet = {
    'A': 'EY1',
    'B': 'B IY1',
    'C': 'S IY1',
    'D': 'D IY1',
    'E': 'IY1',
    'F': 'EH1 F',
    'G': 'JH IY1',
    'H': 'EY1 CH',
    'I': 'AY1',
    'J': 'JH EY1',
    'K': 'K EY1',
    'L': 'EH1 L',
    'M': 'EH1 M',
    'N': 'EH1 N',
    'O': 'OW1',
    'P': 'P IY1',
    'Q': 'K Y UW1',
    'R': 'AA1 R',
    'S': 'EH1 S',
    'T': 'T IY1',
    'U': 'Y UW1',
    'V': 'V IY1',
    'X': 'EH1 K S',
    'Y': 'W AY1',
    'W': 'D AH1 B AH0 L Y UW0',
    'Z': 'Z IY1',
    's': 'Z'
}

# must ignore roman numerals
# _acronym_re = re.compile(r'([A-Z][A-Z]+)s?|([A-Z]\.([A-Z]\.)+s?)')
_acronym_re = re.compile(r'([A-Z][A-Z]+)s?')


def _expand_acronyms(m, add_spaces=True):
    acronym = m.group(0)

    # remove dots if they exist
    acronym = re.sub('\.', '', acronym)

    acronym = "".join(acronym.split())
    arpabet = cmudict.lookup(acronym)

    if arpabet is None:
        acronym = list(acronym)
        arpabet = ["{" + _letter_to_arpabet[letter] + "}" for letter in acronym]
        # temporary fix
        if arpabet[-1] == '{Z}' and len(arpabet) > 1:
            arpabet[-2] = arpabet[-2][:-1] + ' ' + arpabet[-1][1:]
            del arpabet[-1]

        arpabet = ' '.join(arpabet)
    elif len(arpabet) == 1:
        arpabet = "{" + arpabet[0] + "}"
    else:
        arpabet = acronym

    return arpabet


def normalize_acronyms(text):
    text = re.sub(_acronym_re, _expand_acronyms, text)
    return text