From 90ca7b11dcd80c01cd2a5c30e4c4fc58af3b56e2 Mon Sep 17 00:00:00 2001
From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com>
Date: Fri, 10 Feb 2023 01:19:25 -0800
Subject: [PATCH] [G2P] added backward compatibility for english tokenizer and
 fixed unit tests (#5980) (#5984)

Signed-off-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com>
Co-authored-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com>
---
 nemo_text_processing/g2p/data/data_utils.py            | 10 +++++++---
 tests/nemo_text_processing/g2p/data/test_data_utils.py |  4 ++--
 2 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/nemo_text_processing/g2p/data/data_utils.py b/nemo_text_processing/g2p/data/data_utils.py
index 8d9db35a1808..2e3924acdad1 100644
--- a/nemo_text_processing/g2p/data/data_utils.py
+++ b/nemo_text_processing/g2p/data/data_utils.py
@@ -214,7 +214,7 @@ def normalize_unicode_text(text: str) -> str:
     return text
 
 
-def _word_tokenize(words: List[Tuple[str, str, str]]) -> List[Tuple[List[str], bool]]:
+def _word_tokenize(words: List[Tuple[str, str, str]], is_lower: bool = False) -> List[Tuple[List[str], bool]]:
     """
     Process a list of words and attach indicators showing if each word is unchangeable or not. Each word representation
     can be one of valid word, any substring starting from | to | (unchangeable word), or punctuation marks including
@@ -245,6 +245,7 @@ def _word_tokenize(words: List[Tuple[str, str, str]]) -> List[Tuple[List[str], b
     Args:
         words (List[str]): a list of tuples like `(maybe_word, maybe_without_changes, maybe_punct)` where each element
             corresponds to a non-overlapping match of either `_WORDS_RE_EN` or `_WORDS_RE_ANY_LOCALE`.
+        is_lower (bool): a flag to trigger lowercase all words. By default, it is False.
 
     Returns: List[Tuple[List[str], bool]], a list of tuples like `(a list of words, is_unchanged)`.
 
@@ -255,7 +256,10 @@ def _word_tokenize(words: List[Tuple[str, str, str]]) -> List[Tuple[List[str], b
         without_changes = False
 
         if maybe_word != '':
-            token = [maybe_word]
+            if is_lower:
+                token = [maybe_word.lower()]
+            else:
+                token = [maybe_word]
         elif maybe_punct != '':
             token = [maybe_punct]
         elif maybe_without_changes != '':
@@ -274,7 +278,7 @@ def _word_tokenize(words: List[Tuple[str, str, str]]) -> List[Tuple[List[str], b
 
 def english_word_tokenize(text: str) -> List[Tuple[List[str], bool]]:
     words = _WORDS_RE_EN.findall(text)
-    return _word_tokenize(words)
+    return _word_tokenize(words, is_lower=True)
 
 
 def any_locale_word_tokenize(text: str) -> List[Tuple[List[str], bool]]:
diff --git a/tests/nemo_text_processing/g2p/data/test_data_utils.py b/tests/nemo_text_processing/g2p/data/test_data_utils.py
index 5986556663cb..eccff2568459 100644
--- a/tests/nemo_text_processing/g2p/data/test_data_utils.py
+++ b/tests/nemo_text_processing/g2p/data/test_data_utils.py
@@ -86,7 +86,7 @@ def test_any_locale_word_tokenize(self):
     def test_any_locale_word_tokenize_with_accents(self):
         input_text = "The naïve piñata at the café..."
         expected_output = self._create_expected_output(
-            ["the", " ", "naïve", " ", "piñata", " ", "at", " ", "the", " ", "café", "..."]
+            ["The", " ", "naïve", " ", "piñata", " ", "at", " ", "the", " ", "café", "..."]
         )
 
         output = any_locale_word_tokenize(input_text)
@@ -98,7 +98,7 @@ def test_any_locale_word_tokenize_with_numbers(self):
         input_text = r"Three times× four^teen ÷divided by [movies] on \slash."
         expected_output = self._create_expected_output(
             [
-                "three",
+                "Three",
                 " ",
                 "times",
                 "× ",
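
Usage sketch (illustrative, not part of the patch; it assumes `nemo_text_processing`
is importable and uses a made-up sample string): after this change,
`english_word_tokenize` again lowercases words by passing `is_lower=True` to
`_word_tokenize`, restoring the earlier English behavior, while
`any_locale_word_tokenize` keeps the original casing, which is what the updated
tests assert.

    from nemo_text_processing.g2p.data.data_utils import (
        any_locale_word_tokenize,
        english_word_tokenize,
    )

    text = "The naïve piñata..."

    # English tokenizer: words come back lowercased, roughly
    # [(["the"], False), ([" "], False), (["naïve"], False), ...]
    print(english_word_tokenize(text))

    # Locale-agnostic tokenizer: casing is preserved, roughly
    # [(["The"], False), ([" "], False), (["naïve"], False), ...]
    print(any_locale_word_tokenize(text))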