From 90ca7b11dcd80c01cd2a5c30e4c4fc58af3b56e2 Mon Sep 17 00:00:00 2001
From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com>
Date: Fri, 10 Feb 2023 01:19:25 -0800
Subject: [PATCH] [G2P] added backward compatibility for english tokenizer and
 fixed unit tests (#5980) (#5984)

Signed-off-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com>
Co-authored-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com>
---
 nemo_text_processing/g2p/data/data_utils.py            | 10 +++++++---
 tests/nemo_text_processing/g2p/data/test_data_utils.py |  4 ++--
 2 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/nemo_text_processing/g2p/data/data_utils.py b/nemo_text_processing/g2p/data/data_utils.py
index 8d9db35a1808..2e3924acdad1 100644
--- a/nemo_text_processing/g2p/data/data_utils.py
+++ b/nemo_text_processing/g2p/data/data_utils.py
@@ -214,7 +214,7 @@ def normalize_unicode_text(text: str) -> str:
     return text
 
 
-def _word_tokenize(words: List[Tuple[str, str, str]]) -> List[Tuple[List[str], bool]]:
+def _word_tokenize(words: List[Tuple[str, str, str]], is_lower: bool = False) -> List[Tuple[List[str], bool]]:
     """
     Process a list of words and attach indicators showing if each word is unchangeable or not. Each word representation
     can be one of valid word, any substring starting from | to | (unchangeable word), or punctuation marks including
@@ -245,6 +245,7 @@ def _word_tokenize(words: List[Tuple[str, str, str]]) -> List[Tuple[List[str], b
     Args:
         words (List[str]): a list of tuples like `(maybe_word, maybe_without_changes, maybe_punct)` where each element
             corresponds to a non-overlapping match of either `_WORDS_RE_EN` or `_WORDS_RE_ANY_LOCALE`.
+        is_lower (bool): a flag to trigger lowercase all words. By default, it is False.
 
     Returns: List[Tuple[List[str], bool]], a list of tuples like `(a list of words, is_unchanged)`.
 
@@ -255,7 +256,10 @@ def _word_tokenize(words: List[Tuple[str, str, str]]) -> List[Tuple[List[str], b
         without_changes = False
 
         if maybe_word != '':
-            token = [maybe_word]
+            if is_lower:
+                token = [maybe_word.lower()]
+            else:
+                token = [maybe_word]
         elif maybe_punct != '':
             token = [maybe_punct]
         elif maybe_without_changes != '':
@@ -274,7 +278,7 @@ def _word_tokenize(words: List[Tuple[str, str, str]]) -> List[Tuple[List[str], b
 
 def english_word_tokenize(text: str) -> List[Tuple[List[str], bool]]:
     words = _WORDS_RE_EN.findall(text)
-    return _word_tokenize(words)
+    return _word_tokenize(words, is_lower=True)
 
 
 def any_locale_word_tokenize(text: str) -> List[Tuple[List[str], bool]]:
diff --git a/tests/nemo_text_processing/g2p/data/test_data_utils.py b/tests/nemo_text_processing/g2p/data/test_data_utils.py
index 5986556663cb..eccff2568459 100644
--- a/tests/nemo_text_processing/g2p/data/test_data_utils.py
+++ b/tests/nemo_text_processing/g2p/data/test_data_utils.py
@@ -86,7 +86,7 @@ def test_any_locale_word_tokenize(self):
     def test_any_locale_word_tokenize_with_accents(self):
         input_text = "The naïve piñata at the café..."
         expected_output = self._create_expected_output(
-            ["the", " ", "naïve", " ", "piñata", " ", "at", " ", "the", " ", "café", "..."]
+            ["The", " ", "naïve", " ", "piñata", " ", "at", " ", "the", " ", "café", "..."]
         )
 
         output = any_locale_word_tokenize(input_text)
@@ -98,7 +98,7 @@ def test_any_locale_word_tokenize_with_numbers(self):
         input_text = r"Three times× four^teen ÷divided by [movies] on \slash."
         expected_output = self._create_expected_output(
             [
-                "three",
+                "Three",
                 " ",
                 "times",
                 "× ",
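
Usage sketch (illustrative, not part of the patch; it assumes `nemo_text_processing`
is importable and uses a made-up sample string): after this change,
`english_word_tokenize` again lowercases words by passing `is_lower=True` to
`_word_tokenize`, restoring the earlier English behavior, while
`any_locale_word_tokenize` keeps the original casing, which is what the updated
tests assert.

    from nemo_text_processing.g2p.data.data_utils import (
        any_locale_word_tokenize,
        english_word_tokenize,
    )

    text = "The naïve piñata..."

    # English tokenizer: words come back lowercased, roughly
    # [(["the"], False), ([" "], False), (["naïve"], False), ...]
    print(english_word_tokenize(text))

    # Locale-agnostic tokenizer: casing is preserved, roughly
    # [(["The"], False), ([" "], False), (["naïve"], False), ...]
    print(any_locale_word_tokenize(text))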