[G2P] added backward compatibility for english tokenizer and fixed unit tests (NVIDIA#5980) (NVIDIA#5984)

Signed-off-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com>
Co-authored-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com>
2 people authored and titu1994 committed Mar 24, 2023
1 parent 1a1ae42 commit ec3da51
Showing 2 changed files with 9 additions and 5 deletions.
10 changes: 7 additions & 3 deletions nemo_text_processing/g2p/data/data_utils.py
@@ -214,7 +214,7 @@ def normalize_unicode_text(text: str) -> str:
     return text


-def _word_tokenize(words: List[Tuple[str, str, str]]) -> List[Tuple[List[str], bool]]:
+def _word_tokenize(words: List[Tuple[str, str, str]], is_lower: bool = False) -> List[Tuple[List[str], bool]]:
     """
     Process a list of words and attach indicators showing if each word is unchangeable or not. Each word representation
     can be one of valid word, any substring starting from | to | (unchangeable word), or punctuation marks including
@@ -245,6 +245,7 @@ def _word_tokenize(words: List[Tuple[str, str, str]]) -> List[Tuple[List[str], bool]]:
     Args:
         words (List[str]): a list of tuples like `(maybe_word, maybe_without_changes, maybe_punct)` where each element
             corresponds to a non-overlapping match of either `_WORDS_RE_EN` or `_WORDS_RE_ANY_LOCALE`.
+        is_lower (bool): a flag to lowercase all words. Defaults to False.
     Returns: List[Tuple[List[str], bool]], a list of tuples like `(a list of words, is_unchanged)`.
@@ -255,7 +256,10 @@ def _word_tokenize(words: List[Tuple[str, str, str]]) -> List[Tuple[List[str], bool]]:

         without_changes = False
         if maybe_word != '':
-            token = [maybe_word]
+            if is_lower:
+                token = [maybe_word.lower()]
+            else:
+                token = [maybe_word]
         elif maybe_punct != '':
             token = [maybe_punct]
         elif maybe_without_changes != '':
@@ -274,7 +278,7 @@ def _word_tokenize(words: List[Tuple[str, str, str]]) -> List[Tuple[List[str], bool]]:

 def english_word_tokenize(text: str) -> List[Tuple[List[str], bool]]:
     words = _WORDS_RE_EN.findall(text)
-    return _word_tokenize(words)
+    return _word_tokenize(words, is_lower=True)


 def any_locale_word_tokenize(text: str) -> List[Tuple[List[str], bool]]:
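
A minimal sketch of the restored behavior (assuming `nemo_text_processing.g2p.data.data_utils` is importable as in this revision; the sample sentence is borrowed from the unit tests below, and the expected outputs follow the `(a list of words, is_unchanged)` tuple layout documented above):

    from nemo_text_processing.g2p.data.data_utils import (
        any_locale_word_tokenize,
        english_word_tokenize,
    )

    text = "The naïve piñata at the café..."

    # english_word_tokenize passes is_lower=True, so matched words are
    # lowercased again, as they were before the tokenizer refactor.
    print(english_word_tokenize(text)[0])     # expected: (['the'], False)

    # any_locale_word_tokenize keeps the default is_lower=False and
    # preserves the original casing.
    print(any_locale_word_tokenize(text)[0])  # expected: (['The'], False)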
4 changes: 2 additions & 2 deletions tests/nemo_text_processing/g2p/data/test_data_utils.py
@@ -86,7 +86,7 @@ def test_any_locale_word_tokenize(self):
     def test_any_locale_word_tokenize_with_accents(self):
         input_text = "The naïve piñata at the café..."
         expected_output = self._create_expected_output(
-            ["the", " ", "naïve", " ", "piñata", " ", "at", " ", "the", " ", "café", "..."]
+            ["The", " ", "naïve", " ", "piñata", " ", "at", " ", "the", " ", "café", "..."]
         )

         output = any_locale_word_tokenize(input_text)
@@ -98,7 +98,7 @@ def test_any_locale_word_tokenize_with_numbers(self):
         input_text = r"Three times× four^teen ÷divided by [movies] on \slash."
         expected_output = self._create_expected_output(
             [
-                "three",
+                "Three",
                 " ",
                 "times",
                 "× ",
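
These expectation changes pin down the new contract: with `is_lower=False` by default, `any_locale_word_tokenize` must no longer fold "The" and "Three" to lowercase. A quick sanity check in the same spirit (the tuple layout `(a list of words, is_unchanged)` is assumed from the `_word_tokenize` docstring):

    tokens = any_locale_word_tokenize(r"Three times× four^teen ÷divided by [movies] on \slash.")
    # The first match is the word "Three"; its case must be preserved.
    assert tokens[0][0] == ["Three"]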
