French g2p with pronunciation dictionary #7601

Merged: 16 commits, Oct 20, 2023.
Showing changes from 9 commits.
20 changes: 18 additions & 2 deletions nemo/collections/common/tokenizers/text_to_speech/ipa_lexicon.py
@@ -15,7 +15,7 @@

# fmt: off

-SUPPORTED_LOCALES = ["en-US", "de-DE", "es-ES", "it-IT"]
+SUPPORTED_LOCALES = ["en-US", "de-DE", "es-ES", "it-IT", "fr-FR"]

DEFAULT_PUNCTUATION = (
',', '.', '!', '?', '-',
@@ -48,6 +48,13 @@
'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T',
'U', 'V', 'W', 'X', 'Y', 'Z', 'Ä', 'Ö', 'Ü', 'ẞ',
),
"fr-FR": (
'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J',
'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T',
'U', 'V', 'W', 'X', 'Y', 'Z', 'À', 'Â', 'Ä', 'Æ',
'Ç', 'È', 'É', 'Ê', 'Ë', 'Í', 'Î', 'Ï', 'Ñ', 'Ô',
'Ö', 'Ù', 'Û', 'Ü', 'Ō', 'Œ',
),
"it-IT": (
'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J',
'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T',
@@ -77,6 +84,13 @@
'ɒ', 'ɔ', 'ə', 'ɛ', 'ɜ', 'ɡ', 'ɪ', 'ɹ', 'ɾ', 'ʃ',
'ʊ', 'ʌ', 'ʒ', '̃', 'θ'
),
"fr-FR": (
'a', 'b', 'd', 'e', 'f', 'h', 'i', 'j', 'k', 'l',
'm', 'n', 'o', 'p', 'r', 's', 't', 'u', 'v', 'w',
'y', 'z', 'ð', 'ø', 'ŋ', 'œ', 'ɐ', 'ɑ', 'ɒ', 'ɔ',
'ə', 'ɛ', 'ɜ', 'ɡ', 'ɪ', 'ɲ', 'ɹ', 'ʁ', 'ʃ', 'ʊ',
'ʌ', 'ʒ', 'θ', 'ː', '̃'
),
"it-IT": (
'a', 'b', 'd', 'e', 'f', 'h', 'i', 'j', 'k', 'l',
'm', 'n', 'o', 'p', 'r', 's', 't', 'u', 'v', 'w',
@@ -143,7 +157,7 @@ def get_ipa_punctuation_list(locale):
punct_set = set(DEFAULT_PUNCTUATION)
# TODO @xueyang: verify potential mismatches with locale-specific punctuation sets used
# in nemo_text_processing.text_normalization.en.taggers.punctuation.py
-if locale in ["de-DE", "es-ES", "it-IT"]:
+if locale in ["de-DE", "es-ES", "it-IT", "fr-FR"]:
# ref: https://en.wikipedia.org/wiki/Guillemet#Uses
punct_set.update(['«', '»', '‹', '›'])
if locale == "de-DE":
@@ -190,6 +204,8 @@ def get_ipa_punctuation_list(locale):
elif locale == "es-ES":
# ref: https://en.wikipedia.org/wiki/Spanish_orthography#Punctuation
punct_set.update(['¿', '¡'])
elif locale == "fr-FR":
punct_set.update(['–', '“', '”', '…', '̀', '́', '̂', '̈', '̧'])
XuesongYang marked this conversation as resolved.
Collaborator


Regarding the extra punctuation marks used in French, could you please add a Unicode comment for each punctuation mark, as was done for "de-DE"? It is not easy to pinpoint the difference between similar-looking characters, such as the ones below. Thanks!

'‒',  # figure dash, U+2012, decimal 8210
'–',  # en dash, U+2013, decimal 8211
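
To make the request concrete, here is a sketch of what the annotated fr-FR branch might look like (code points checked against the Unicode charts; the comment style mirrors the existing de-DE entries):

    elif locale == "fr-FR":
        punct_set.update(
            [
                '–',  # en dash, U+2013, decimal 8211
                '“',  # left double quotation mark, U+201C, decimal 8220
                '”',  # right double quotation mark, U+201D, decimal 8221
                '…',  # horizontal ellipsis, U+2026, decimal 8230
                '̀',  # combining grave accent, U+0300, decimal 768
                '́',  # combining acute accent, U+0301, decimal 769
                '̂',  # combining circumflex accent, U+0302, decimal 770
                '̈',  # combining diaeresis, U+0308, decimal 776
                '̧',  # combining cedilla, U+0327, decimal 807
            ]
        )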


punct_list = sorted(list(punct_set))
return punct_list
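
As a quick smoke test of the new locale constants, here is a minimal sketch. get_ipa_punctuation_list appears in this diff; get_grapheme_character_set is assumed to be exported from the same ipa_lexicon.py module, since the new tokenizer below calls it with the same locale and case arguments.

    from nemo.collections.common.tokenizers.text_to_speech.ipa_lexicon import (
        get_grapheme_character_set,
        get_ipa_punctuation_list,
    )

    # The fr-FR grapheme set added above includes the oe ligature;
    # case="lower" is assumed to fold the uppercase letters listed in the diff.
    fr_graphemes = get_grapheme_character_set(locale="fr-FR", case="lower")
    assert 'œ' in fr_graphemes

    # Guillemets are added for fr-FR by get_ipa_punctuation_list in this diff.
    fr_punct = get_ipa_punctuation_list("fr-FR")
    assert '«' in fr_punct and '»' in fr_punct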
nemo/collections/common/tokenizers/text_to_speech/tokenizer_utils.py
@@ -19,6 +19,7 @@
from typing import List, Tuple

__all__ = [
"french_text_preprocessing",
"chinese_text_preprocessing",
"english_text_preprocessing",
"any_locale_text_preprocessing",
@@ -196,3 +197,7 @@ def italian_text_preprocessing(text: str) -> str:

def chinese_text_preprocessing(text: str) -> str:
return text


def french_text_preprocessing(text: str) -> str:
return text.lower()
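
For illustration, the new hook is a plain str.lower(), which folds accented capitals as well; the unit tests further down rely on exactly this behavior.

    from nemo.collections.common.tokenizers.text_to_speech.tokenizer_utils import french_text_preprocessing

    # Accented uppercase letters are lowercased along with ASCII ones.
    print(french_text_preprocessing("À BIENTÔT, Déjà Vu !"))  # -> "à bientôt, déjà vu !"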
nemo/collections/common/tokenizers/text_to_speech/tts_tokenizers.py
@@ -28,6 +28,7 @@
any_locale_text_preprocessing,
chinese_text_preprocessing,
english_text_preprocessing,
french_text_preprocessing,
italian_text_preprocessing,
spanish_text_preprocessing,
)
@@ -268,6 +269,35 @@ def __init__(
)


class FrenchCharsTokenizer(BaseCharsTokenizer):

PUNCT_LIST = get_ipa_punctuation_list("fr-FR")

def __init__(
self, punct=True, apostrophe=True, add_blank_at=None, pad_with_space=False, non_default_punct_list=None,
):
"""French grapheme tokenizer.
Args:
punct: Whether to reserve grapheme for basic punctuation or not.
apostrophe: Whether to use apostrophe or not.
add_blank_at: Add blank to labels in the specified order ("last") or after tokens (any non None),
if None then no blank in labels.
pad_with_space: Whether to pad text with spaces at the beginning and at the end or not.
non_default_punct_list: List of punctuation marks to use instead of the default set.
"""

fr_alphabet = get_grapheme_character_set(locale="fr-FR", case="lower")
super().__init__(
chars=fr_alphabet,
punct=punct,
apostrophe=apostrophe,
add_blank_at=add_blank_at,
pad_with_space=pad_with_space,
non_default_punct_list=non_default_punct_list,
text_preprocessing_func=french_text_preprocessing,
)


class ItalianCharsTokenizer(BaseCharsTokenizer):
PUNCT_LIST = get_ipa_punctuation_list("it-IT")

@@ -619,7 +649,7 @@ def __init__(
Args:
g2p: Grapheme to phoneme module, should be IpaG2p or some subclass thereof.
locale: Locale used to determine default text processing logic and punctuation.
-Supports ["en-US", "de-DE", "es-ES"]. Defaults to "en-US".
+Supports ["en-US", "de-DE", "es-ES", "fr-FR"]. Defaults to "en-US".
XuesongYang marked this conversation as resolved.
Specify None if implementing custom logic for a new locale.
punct: Whether to reserve grapheme for basic punctuation or not.
non_default_punct_list: List of punctuation marks to use instead of the default set, if any.
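
A minimal usage sketch for the new grapheme tokenizer; the encode() call is an assumption based on the interface that BaseCharsTokenizer subclasses in this module conventionally expose, and is not shown in this diff.

    from nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers import FrenchCharsTokenizer

    tokenizer = FrenchCharsTokenizer()
    # french_text_preprocessing lowercases the input first, so
    # "Bon après-midi !" and "bon après-midi !" yield the same ids.
    token_ids = tokenizer.encode("Bon après-midi !")  # encode() is assumed here
    print(len(token_ids))  # one id per input character (cf. the unit test below)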
21 changes: 21 additions & 0 deletions nemo/collections/tts/g2p/models/i18n_ipa.py
@@ -405,6 +405,27 @@ def parse_one_word(self, word: str) -> Tuple[List[str], bool]:
else:
return self.phoneme_dict[word_found][0] + ["z"], True

if self.locale == "fr-FR":
# contracted prefix (with apostrophe) - not in phoneme dict
contractions_g = ['l', 'c', 'd', 'j', 'm', 'n', 'qu', 's', 't', 'puisqu', 'lorsqu', 'jusqu']
contractions_p = ['l', 's', 'd', 'ʒ', 'm', 'n', 'k', 's', 't', 'pyisk', 'loʁsk', 'ʒysk']

for cont_g, cont_p in zip(contractions_g, contractions_p):
starter = cont_g + "'"
if len(word) > 2 and (word.startswith(starter) or word.startswith(starter.upper())):
word_found = None
if (word not in self.phoneme_dict) and (word.upper() not in self.phoneme_dict):
start_index = len(starter)
if word[start_index:] in self.phoneme_dict:
word_found = word[start_index:]
elif word[start_index:].upper() in self.phoneme_dict:
word_found = word[start_index:].upper()

if word_found is not None and (
not self.ignore_ambiguous_words or self.is_unique_in_phoneme_dict(word_found)
):
return [c for c in cont_p] + self.phoneme_dict[word_found][0], True

# For the words that have a single pronunciation, directly look it up in the phoneme_dict; for the
# words that have multiple pronunciation variants, if we don't want to ignore them, then directly choose their
# first pronunciation variant as the target phonemes.
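
Read outside the diff, the new branch is a simple prefix split: strip a contracted prefix such as l' or j', look the remainder up in the pronunciation dictionary, and prepend the prefix's phonemes. Below is a self-contained sketch of the same idea with a toy dictionary (hypothetical entry, simplified case handling):

    # Toy sketch of the fr-FR contraction handling in parse_one_word.
    phoneme_dict = {"HOMME": [["ɔ", "m"]]}  # hypothetical dictionary entry

    contractions_g = ['l', 'c', 'd', 'j', 'm', 'n', 'qu', 's', 't', 'puisqu', 'lorsqu', 'jusqu']
    contractions_p = ['l', 's', 'd', 'ʒ', 'm', 'n', 'k', 's', 't', 'pyisk', 'loʁsk', 'ʒysk']

    def split_contraction(word: str):
        for cont_g, cont_p in zip(contractions_g, contractions_p):
            starter = cont_g + "'"
            if len(word) > 2 and word.lower().startswith(starter):
                remainder = word[len(starter):].upper()
                if remainder in phoneme_dict:
                    # prefix phonemes + first pronunciation variant of the remainder
                    return list(cont_p) + phoneme_dict[remainder][0]
        return None

    print(split_contraction("l'homme"))  # ['l', 'ɔ', 'm']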
@@ -16,6 +16,7 @@
from nemo.collections.common.tokenizers.text_to_speech.tokenizer_utils import (
any_locale_word_tokenize,
english_word_tokenize,
french_text_preprocessing,
)


@@ -120,3 +121,30 @@ def test_any_locale_word_tokenize_with_numbers(self):

output = any_locale_word_tokenize(input_text)
assert output == expected_output

@pytest.mark.run_only_on('CPU')
@pytest.mark.unit
def test_french_text_preprocessing_lower(self):
input_text = "pomme banane poire"
expected_output = "pomme banane poire"

output = french_text_preprocessing(input_text)
assert output == expected_output

@pytest.mark.run_only_on('CPU')
@pytest.mark.unit
def test_french_text_preprocessing_mixed(self):
input_text = "BONJOUR le Monde!"
expected_output = "bonjour le monde!"

output = french_text_preprocessing(input_text)
assert output == expected_output

@pytest.mark.run_only_on('CPU')
@pytest.mark.unit
def test_french_text_preprocessing_upper(self):
input_text = "A BIENTÔT."
expected_output = "a bientôt."

output = french_text_preprocessing(input_text)
assert output == expected_output
Collaborator


These tests are for the text_preprocessing functions, not for the tokenize functions. Could you please revise them to follow the unit tests above?

Collaborator


We can remove the three unit tests above and instead add test examples to all the functions related to any_locale_word_tokenize. For example, you could extend the input_text and expected_output below into lists by adding fr-FR examples.

    @pytest.mark.run_only_on('CPU')
    @pytest.mark.unit
    def test_any_locale_word_tokenize(self):
        input_text = "apple banana pear"
        expected_output = self._create_expected_output(["apple", " ", "banana", " ", "pear"])

        output = any_locale_word_tokenize(input_text)
        assert output == expected_output

    @pytest.mark.run_only_on('CPU')
    @pytest.mark.unit
    def test_any_locale_word_tokenize_with_accents(self):
        input_text = "The naïve piñata at the café..."
        expected_output = self._create_expected_output(
            ["The", " ", "naïve", " ", "piñata", " ", "at", " ", "the", " ", "café", "..."]
        )

        output = any_locale_word_tokenize(input_text)
        assert output == expected_output

    @pytest.mark.run_only_on('CPU')
    @pytest.mark.unit
    def test_any_locale_word_tokenize_with_numbers(self):
        input_text = r"Three times× four^teen ÷divided by [movies] on \slash."
        expected_output = self._create_expected_output(
            [
                "Three",
                " ",
                "times",
                "× ",
                "four",
                "^",
                "teen",
                " ÷",
                "divided",
                " ",
                "by",
                " [",
                "movies",
                "] ",
                "on",
                " \\",
                "slash",
                ".",
            ]
        )

        output = any_locale_word_tokenize(input_text)
        assert output == expected_output
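
One possible realization of that suggestion, folding a French case into the same test style; the expected tokenization is an assumption modeled on the accents test above, where "naïve" and "café" pass through as single tokens:

    @pytest.mark.run_only_on('CPU')
    @pytest.mark.unit
    def test_any_locale_word_tokenize_french(self):
        # Hypothetical fr-FR example; accented words are expected to
        # pass through whole, mirroring the accents test above.
        input_text = "à bientôt mon café"
        expected_output = self._create_expected_output(["à", " ", "bientôt", " ", "mon", " ", "café"])

        output = any_locale_word_tokenize(input_text)
        assert output == expected_output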

Expand Up @@ -16,6 +16,7 @@

from nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers import (
EnglishCharsTokenizer,
FrenchCharsTokenizer,
GermanCharsTokenizer,
IPATokenizer,
ItalianCharsTokenizer,
@@ -118,6 +119,18 @@ def test_spanish_chars_tokenizer(self):
assert chars == expected_output
assert len(tokens) == len(input_text)

@pytest.mark.run_only_on('CPU')
@pytest.mark.unit
def test_french_chars_tokenizer(self):
input_text = "Bon après-midi !"
expected_output = "bon après-midi !"

tokenizer = FrenchCharsTokenizer()
chars, tokens = self._parse_text(tokenizer, input_text)

assert chars == expected_output
assert len(tokens) == len(input_text)

XuesongYang marked this conversation as resolved.
@pytest.mark.run_only_on('CPU')
@pytest.mark.unit
def test_ipa_tokenizer(self):