From a6c2552189e964c736dc1a4caffc1e32ee017b58 Mon Sep 17 00:00:00 2001 From: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> Date: Thu, 9 Feb 2023 14:47:02 -0800 Subject: [PATCH] [G2P] fixed typos and broken import library. (#5978) Signed-off-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> --- nemo_text_processing/g2p/data/data_utils.py | 1 + .../g2p/data/test_data_utils.py | 17 +++++++++-------- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/nemo_text_processing/g2p/data/data_utils.py b/nemo_text_processing/g2p/data/data_utils.py index d695ec9ba391..8d9db35a1808 100644 --- a/nemo_text_processing/g2p/data/data_utils.py +++ b/nemo_text_processing/g2p/data/data_utils.py @@ -35,6 +35,7 @@ "GRAPHEME_CASE_UPPER", "GRAPHEME_CASE_LOWER", "GRAPHEME_CASE_MIXED", + "get_heteronym_spans", ] # Derived from LJSpeech diff --git a/tests/nemo_text_processing/g2p/data/test_data_utils.py b/tests/nemo_text_processing/g2p/data/test_data_utils.py index c04a4d0f0e13..5986556663cb 100644 --- a/tests/nemo_text_processing/g2p/data/test_data_utils.py +++ b/tests/nemo_text_processing/g2p/data/test_data_utils.py @@ -16,7 +16,7 @@ from nemo_text_processing.g2p.data.data_utils import ( any_locale_word_tokenize, english_word_tokenize, - get_homograph_spans, + get_heteronym_spans, ) @@ -95,7 +95,7 @@ def test_any_locale_word_tokenize_with_accents(self): @pytest.mark.run_only_on('CPU') @pytest.mark.unit def test_any_locale_word_tokenize_with_numbers(self): - input_text = "Three times× four^teen ÷divided by [movies] on \slash." + input_text = r"Three times× four^teen ÷divided by [movies] on \slash." expected_output = self._create_expected_output( [ "three", @@ -124,10 +124,11 @@ def test_any_locale_word_tokenize_with_numbers(self): @pytest.mark.run_only_on('CPU') @pytest.mark.unit - def test_get_homograph_spans(self): - supported_homographs = ["live", "read", "protest", "diffuse", "desert"] + def test_get_heteronym_spans(self): + supported_heteronyms = ["live", "read", "protest", "diffuse", "desert"] sentences = [ - "I live in California. I READ a book. Only people who have already gained something are willing to protest. He reads a book!", + "I live in California. I READ a book. Only people who have already gained something are willing to protest." + " He reads a book!", "Yesterday, I read a book.", "He read a book last night and pre-diffuse and LivE-post and pre-desert-post.", "the soldier deserted the desert in desert.", @@ -139,13 +140,13 @@ def test_get_homograph_spans(self): [(3, 7), (34, 41), (46, 50), (64, 70)], [(25, 31), (35, 41)], ] - expected_homographs = [ + expected_heteronyms = [ ["live", "read", "protest"], ['read'], ['read', 'diffuse', 'live', 'desert'], ['desert', 'desert'], ] - out_start_end, out_homographs = get_homograph_spans(sentences, supported_homographs) + out_start_end, out_heteronyms = get_heteronym_spans(sentences, supported_heteronyms) assert out_start_end == expected_start_end, "start-end spans do not match" - assert out_homographs == expected_homographs, "homograph spans do not match" + assert out_heteronyms == expected_heteronyms, "heteronym spans do not match"