NVIDIA · XuesongYang · Feb 10, 2023 · Feb 9, 2023
diff --git a/nemo_text_processing/g2p/data/data_utils.py b/nemo_text_processing/g2p/data/data_utils.py
@@ -35,6 +35,7 @@
     "GRAPHEME_CASE_UPPER",
     "GRAPHEME_CASE_LOWER",
     "GRAPHEME_CASE_MIXED",
+    "get_heteronym_spans",
 ]
 
 # Derived from LJSpeech

diff --git a/tests/nemo_text_processing/g2p/data/test_data_utils.py b/tests/nemo_text_processing/g2p/data/test_data_utils.py
@@ -16,7 +16,7 @@
 from nemo_text_processing.g2p.data.data_utils import (
     any_locale_word_tokenize,
     english_word_tokenize,
-    get_homograph_spans,
+    get_heteronym_spans,
 )
 
 
@@ -95,7 +95,7 @@ def test_any_locale_word_tokenize_with_accents(self):
     @pytest.mark.run_only_on('CPU')
     @pytest.mark.unit
     def test_any_locale_word_tokenize_with_numbers(self):
-        input_text = "Three times× four^teen ÷divided by [movies] on \slash."
+        input_text = r"Three times× four^teen ÷divided by [movies] on \slash."
         expected_output = self._create_expected_output(
             [
                 "three",
@@ -124,10 +124,11 @@ def test_any_locale_word_tokenize_with_numbers(self):
 
     @pytest.mark.run_only_on('CPU')
     @pytest.mark.unit
-    def test_get_homograph_spans(self):
-        supported_homographs = ["live", "read", "protest", "diffuse", "desert"]
+    def test_get_heteronym_spans(self):
+        supported_heteronyms = ["live", "read", "protest", "diffuse", "desert"]
         sentences = [
-            "I live in California. I READ a book. Only people who have already gained something are willing to protest. He reads a book!",
+            "I live in California. I READ a book. Only people who have already gained something are willing to protest."
+            " He reads a book!",
             "Yesterday, I read a book.",
             "He read a book last night and pre-diffuse and LivE-post and pre-desert-post.",
             "the soldier deserted the desert in desert.",
@@ -139,13 +140,13 @@ def test_get_homograph_spans(self):
             [(3, 7), (34, 41), (46, 50), (64, 70)],
             [(25, 31), (35, 41)],
         ]
-        expected_homographs = [
+        expected_heteronyms = [
             ["live", "read", "protest"],
             ['read'],
             ['read', 'diffuse', 'live', 'desert'],
             ['desert', 'desert'],
         ]
 
-        out_start_end, out_homographs = get_homograph_spans(sentences, supported_homographs)
+        out_start_end, out_heteronyms = get_heteronym_spans(sentences, supported_heteronyms)
         assert out_start_end == expected_start_end, "start-end spans do not match"
-        assert out_homographs == expected_homographs, "homograph spans do not match"
+        assert out_heteronyms == expected_heteronyms, "heteronym spans do not match"