[G2P] added backward compatibility for english tokenizer and fixed unit tests (NVIDIA#5980) (NVIDIA#5984)

Signed-off-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com>
Co-authored-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com>
2 people authored and titu1994 committed Mar 24, 2023
1 parent 1a1ae42 commit ec3da51
Showing 2 changed files with 9 additions and 5 deletions.
10 changes: 7 additions & 3 deletions nemo_text_processing/g2p/data/data_utils.py
@@ -214,7 +214,7 @@ def normalize_unicode_text(text: str) -> str:
     return text


-def _word_tokenize(words: List[Tuple[str, str, str]]) -> List[Tuple[List[str], bool]]:
+def _word_tokenize(words: List[Tuple[str, str, str]], is_lower: bool = False) -> List[Tuple[List[str], bool]]:
     """
     Process a list of words and attach indicators showing if each word is unchangeable or not. Each word representation
     can be one of valid word, any substring starting from | to | (unchangeable word), or punctuation marks including
@@ -245,6 +245,7 @@ def _word_tokenize(words: List[Tuple[str, str, str]]) -> List[Tuple[List[str], bool]]:
     Args:
         words (List[str]): a list of tuples like `(maybe_word, maybe_without_changes, maybe_punct)` where each element
             corresponds to a non-overlapping match of either `_WORDS_RE_EN` or `_WORDS_RE_ANY_LOCALE`.
+        is_lower (bool): a flag to lowercase all words. Defaults to False.
     Returns: List[Tuple[List[str], bool]], a list of tuples like `(a list of words, is_unchanged)`.
@@ -255,7 +256,10 @@ def _word_tokenize(words: List[Tuple[str, str, str]]) -> List[Tuple[List[str], bool]]:

         without_changes = False
         if maybe_word != '':
-            token = [maybe_word]
+            if is_lower:
+                token = [maybe_word.lower()]
+            else:
+                token = [maybe_word]
         elif maybe_punct != '':
             token = [maybe_punct]
         elif maybe_without_changes != '':
@@ -274,7 +278,7 @@ def _word_tokenize(words: List[Tuple[str, str, str]]) -> List[Tuple[List[str], bool]]:

 def english_word_tokenize(text: str) -> List[Tuple[List[str], bool]]:
     words = _WORDS_RE_EN.findall(text)
-    return _word_tokenize(words)
+    return _word_tokenize(words, is_lower=True)


 def any_locale_word_tokenize(text: str) -> List[Tuple[List[str], bool]]:
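
A minimal sketch of the restored behavior (assuming `nemo_text_processing.g2p.data.data_utils` is importable as in this revision; the sample sentence is borrowed from the unit tests below, and the expected outputs follow the `(a list of words, is_unchanged)` tuple layout documented above):

    from nemo_text_processing.g2p.data.data_utils import (
        any_locale_word_tokenize,
        english_word_tokenize,
    )

    text = "The naïve piñata at the café..."

    # english_word_tokenize passes is_lower=True, so matched words are
    # lowercased again, as they were before the tokenizer refactor.
    print(english_word_tokenize(text)[0])     # expected: (['the'], False)

    # any_locale_word_tokenize keeps the default is_lower=False and
    # preserves the original casing.
    print(any_locale_word_tokenize(text)[0])  # expected: (['The'], False)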
4 changes: 2 additions & 2 deletions tests/nemo_text_processing/g2p/data/test_data_utils.py
@@ -86,7 +86,7 @@ def test_any_locale_word_tokenize(self):
     def test_any_locale_word_tokenize_with_accents(self):
         input_text = "The naïve piñata at the café..."
         expected_output = self._create_expected_output(
-            ["the", " ", "naïve", " ", "piñata", " ", "at", " ", "the", " ", "café", "..."]
+            ["The", " ", "naïve", " ", "piñata", " ", "at", " ", "the", " ", "café", "..."]
         )

         output = any_locale_word_tokenize(input_text)
@@ -98,7 +98,7 @@ def test_any_locale_word_tokenize_with_numbers(self):
         input_text = r"Three times× four^teen ÷divided by [movies] on \slash."
         expected_output = self._create_expected_output(
             [
-                "three",
+                "Three",
                 " ",
                 "times",
                 "× ",
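
These expectation changes pin down the new contract: with `is_lower=False` by default, `any_locale_word_tokenize` must no longer fold "The" and "Three" to lowercase. A quick sanity check in the same spirit (the tuple layout `(a list of words, is_unchanged)` is assumed from the `_word_tokenize` docstring):

    tokens = any_locale_word_tokenize(r"Three times× four^teen ÷divided by [movies] on \slash.")
    # The first match is the word "Three"; its case must be preserved.
    assert tokens[0][0] == ["Three"]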
