French g2p with pronunciation dictionary #7601

Merged: 16 commits, Oct 20, 2023.
Showing changes from 9 commits.
20 changes: 18 additions & 2 deletions nemo/collections/common/tokenizers/text_to_speech/ipa_lexicon.py
@@ -15,7 +15,7 @@

# fmt: off

-SUPPORTED_LOCALES = ["en-US", "de-DE", "es-ES", "it-IT"]
+SUPPORTED_LOCALES = ["en-US", "de-DE", "es-ES", "it-IT", "fr-FR"]

DEFAULT_PUNCTUATION = (
',', '.', '!', '?', '-',
@@ -48,6 +48,13 @@
'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T',
'U', 'V', 'W', 'X', 'Y', 'Z', 'Ä', 'Ö', 'Ü', 'ẞ',
),
"fr-FR": (
'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J',
'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T',
'U', 'V', 'W', 'X', 'Y', 'Z', 'À', 'Â', 'Ä', 'Æ',
'Ç', 'È', 'É', 'Ê', 'Ë', 'Í', 'Î', 'Ï', 'Ñ', 'Ô',
'Ö', 'Ù', 'Û', 'Ü', 'Ō', 'Œ',
),
"it-IT": (
'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J',
'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T',
@@ -77,6 +84,13 @@
'ɒ', 'ɔ', 'ə', 'ɛ', 'ɜ', 'ɡ', 'ɪ', 'ɹ', 'ɾ', 'ʃ',
'ʊ', 'ʌ', 'ʒ', '̃', 'θ'
),
"fr-FR": (
'a', 'b', 'd', 'e', 'f', 'h', 'i', 'j', 'k', 'l',
'm', 'n', 'o', 'p', 'r', 's', 't', 'u', 'v', 'w',
'y', 'z', 'ð', 'ø', 'ŋ', 'œ', 'ɐ', 'ɑ', 'ɒ', 'ɔ',
'ə', 'ɛ', 'ɜ', 'ɡ', 'ɪ', 'ɲ', 'ɹ', 'ʁ', 'ʃ', 'ʊ',
'ʌ', 'ʒ', 'θ', 'ː', '̃'
),
"it-IT": (
'a', 'b', 'd', 'e', 'f', 'h', 'i', 'j', 'k', 'l',
'm', 'n', 'o', 'p', 'r', 's', 't', 'u', 'v', 'w',
@@ -143,7 +157,7 @@ def get_ipa_punctuation_list(locale):
punct_set = set(DEFAULT_PUNCTUATION)
# TODO @xueyang: verify potential mismatches with locale-specific punctuation sets used
# in nemo_text_processing.text_normalization.en.taggers.punctuation.py
-if locale in ["de-DE", "es-ES", "it-IT"]:
+if locale in ["de-DE", "es-ES", "it-IT", "fr-FR"]:
# ref: https://en.wikipedia.org/wiki/Guillemet#Uses
punct_set.update(['«', '»', '‹', '›'])
if locale == "de-DE":
@@ -190,6 +204,8 @@ def get_ipa_punctuation_list(locale):
elif locale == "es-ES":
# ref: https://en.wikipedia.org/wiki/Spanish_orthography#Punctuation
punct_set.update(['¿', '¡'])
elif locale == "fr-FR":
punct_set.update(['–', '“', '”', '…', '̀', '́', '̂', '̈', '̧'])
XuesongYang marked this conversation as resolved.
Collaborator


Regarding the extra punctuation marks used in French, could you please add a Unicode comment for each punctuation mark, as was done for "de-DE"? It is not easy to pinpoint the difference between similar-looking characters, such as the ones below. Thanks!

'‒',  # figure dash, U+2012, decimal 8210
'–',  # en dash, U+2013, decimal 8211
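
To make the request concrete, here is a sketch of what the annotated fr-FR branch might look like (code points checked against the Unicode charts; the comment style mirrors the existing de-DE entries):

    elif locale == "fr-FR":
        punct_set.update(
            [
                '–',  # en dash, U+2013, decimal 8211
                '“',  # left double quotation mark, U+201C, decimal 8220
                '”',  # right double quotation mark, U+201D, decimal 8221
                '…',  # horizontal ellipsis, U+2026, decimal 8230
                '̀',  # combining grave accent, U+0300, decimal 768
                '́',  # combining acute accent, U+0301, decimal 769
                '̂',  # combining circumflex accent, U+0302, decimal 770
                '̈',  # combining diaeresis, U+0308, decimal 776
                '̧',  # combining cedilla, U+0327, decimal 807
            ]
        )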


punct_list = sorted(list(punct_set))
return punct_list
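
As a quick smoke test of the new locale constants, here is a minimal sketch. get_ipa_punctuation_list appears in this diff; get_grapheme_character_set is assumed to be exported from the same ipa_lexicon.py module, since the new tokenizer below calls it with the same locale and case arguments.

    from nemo.collections.common.tokenizers.text_to_speech.ipa_lexicon import (
        get_grapheme_character_set,
        get_ipa_punctuation_list,
    )

    # The fr-FR grapheme set added above includes the oe ligature;
    # case="lower" is assumed to fold the uppercase letters listed in the diff.
    fr_graphemes = get_grapheme_character_set(locale="fr-FR", case="lower")
    assert 'œ' in fr_graphemes

    # Guillemets are added for fr-FR by get_ipa_punctuation_list in this diff.
    fr_punct = get_ipa_punctuation_list("fr-FR")
    assert '«' in fr_punct and '»' in fr_punct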
nemo/collections/common/tokenizers/text_to_speech/tokenizer_utils.py
@@ -19,6 +19,7 @@
from typing import List, Tuple

__all__ = [
"french_text_preprocessing",
"chinese_text_preprocessing",
"english_text_preprocessing",
"any_locale_text_preprocessing",
@@ -196,3 +197,7 @@ def italian_text_preprocessing(text: str) -> str:

def chinese_text_preprocessing(text: str) -> str:
return text


def french_text_preprocessing(text: str) -> str:
return text.lower()
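
For illustration, the new hook is a plain str.lower(), which folds accented capitals as well; the unit tests further down rely on exactly this behavior.

    from nemo.collections.common.tokenizers.text_to_speech.tokenizer_utils import french_text_preprocessing

    # Accented uppercase letters are lowercased along with ASCII ones.
    print(french_text_preprocessing("À BIENTÔT, Déjà Vu !"))  # -> "à bientôt, déjà vu !"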
nemo/collections/common/tokenizers/text_to_speech/tts_tokenizers.py
@@ -28,6 +28,7 @@
any_locale_text_preprocessing,
chinese_text_preprocessing,
english_text_preprocessing,
french_text_preprocessing,
italian_text_preprocessing,
spanish_text_preprocessing,
)
@@ -268,6 +269,35 @@ def __init__(
)


class FrenchCharsTokenizer(BaseCharsTokenizer):

PUNCT_LIST = get_ipa_punctuation_list("fr-FR")

def __init__(
self, punct=True, apostrophe=True, add_blank_at=None, pad_with_space=False, non_default_punct_list=None,
):
"""French grapheme tokenizer.
Args:
punct: Whether to reserve grapheme for basic punctuation or not.
apostrophe: Whether to use apostrophe or not.
add_blank_at: Add blank to labels in the specified order ("last") or after tokens (any non None),
if None then no blank in labels.
pad_with_space: Whether to pad text with spaces at the beginning and at the end or not.
non_default_punct_list: List of punctuation marks to use instead of the default set.
"""

fr_alphabet = get_grapheme_character_set(locale="fr-FR", case="lower")
super().__init__(
chars=fr_alphabet,
punct=punct,
apostrophe=apostrophe,
add_blank_at=add_blank_at,
pad_with_space=pad_with_space,
non_default_punct_list=non_default_punct_list,
text_preprocessing_func=french_text_preprocessing,
)


class ItalianCharsTokenizer(BaseCharsTokenizer):
PUNCT_LIST = get_ipa_punctuation_list("it-IT")

@@ -619,7 +649,7 @@ def __init__(
Args:
g2p: Grapheme to phoneme module, should be IpaG2p or some subclass thereof.
locale: Locale used to determine default text processing logic and punctuation.
-Supports ["en-US", "de-DE", "es-ES"]. Defaults to "en-US".
+Supports ["en-US", "de-DE", "es-ES", "fr-FR"]. Defaults to "en-US".
XuesongYang marked this conversation as resolved.
Specify None if implementing custom logic for a new locale.
punct: Whether to reserve grapheme for basic punctuation or not.
non_default_punct_list: List of punctuation marks to use instead of the default set, if any.
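
A minimal usage sketch for the new grapheme tokenizer; the encode() call is an assumption based on the interface that BaseCharsTokenizer subclasses in this module conventionally expose, and is not shown in this diff.

    from nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers import FrenchCharsTokenizer

    tokenizer = FrenchCharsTokenizer()
    # french_text_preprocessing lowercases the input first, so
    # "Bon après-midi !" and "bon après-midi !" yield the same ids.
    token_ids = tokenizer.encode("Bon après-midi !")  # encode() is assumed here
    print(len(token_ids))  # one id per input character (cf. the unit test below)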
21 changes: 21 additions & 0 deletions nemo/collections/tts/g2p/models/i18n_ipa.py
@@ -405,6 +405,27 @@ def parse_one_word(self, word: str) -> Tuple[List[str], bool]:
else:
return self.phoneme_dict[word_found][0] + ["z"], True

if self.locale == "fr-FR":
# contracted prefix (with apostrophe) - not in phoneme dict
contractions_g = ['l', 'c', 'd', 'j', 'm', 'n', 'qu', 's', 't', 'puisqu', 'lorsqu', 'jusqu']
contractions_p = ['l', 's', 'd', 'ʒ', 'm', 'n', 'k', 's', 't', 'pyisk', 'loʁsk', 'ʒysk']

for cont_g, cont_p in zip(contractions_g, contractions_p):
starter = cont_g + "'"
if len(word) > 2 and (word.startswith(starter) or word.startswith(starter.upper())):
word_found = None
if (word not in self.phoneme_dict) and (word.upper() not in self.phoneme_dict):
start_index = len(starter)
if word[start_index:] in self.phoneme_dict:
word_found = word[start_index:]
elif word[start_index:].upper() in self.phoneme_dict:
word_found = word[start_index:].upper()

if word_found is not None and (
not self.ignore_ambiguous_words or self.is_unique_in_phoneme_dict(word_found)
):
return [c for c in cont_p] + self.phoneme_dict[word_found][0], True

# For the words that have a single pronunciation, directly look it up in the phoneme_dict; for the
# words that have multiple pronunciation variants, if we don't want to ignore them, then directly choose their
# first pronunciation variant as the target phonemes.
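
Read outside the diff, the new branch is a simple prefix split: strip a contracted prefix such as l' or j', look the remainder up in the pronunciation dictionary, and prepend the prefix's phonemes. Below is a self-contained sketch of the same idea with a toy dictionary (hypothetical entry, simplified case handling):

    # Toy sketch of the fr-FR contraction handling in parse_one_word.
    phoneme_dict = {"HOMME": [["ɔ", "m"]]}  # hypothetical dictionary entry

    contractions_g = ['l', 'c', 'd', 'j', 'm', 'n', 'qu', 's', 't', 'puisqu', 'lorsqu', 'jusqu']
    contractions_p = ['l', 's', 'd', 'ʒ', 'm', 'n', 'k', 's', 't', 'pyisk', 'loʁsk', 'ʒysk']

    def split_contraction(word: str):
        for cont_g, cont_p in zip(contractions_g, contractions_p):
            starter = cont_g + "'"
            if len(word) > 2 and word.lower().startswith(starter):
                remainder = word[len(starter):].upper()
                if remainder in phoneme_dict:
                    # prefix phonemes + first pronunciation variant of the remainder
                    return list(cont_p) + phoneme_dict[remainder][0]
        return None

    print(split_contraction("l'homme"))  # ['l', 'ɔ', 'm']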
@@ -16,6 +16,7 @@
from nemo.collections.common.tokenizers.text_to_speech.tokenizer_utils import (
any_locale_word_tokenize,
english_word_tokenize,
french_text_preprocessing,
)


@@ -120,3 +121,30 @@ def test_any_locale_word_tokenize_with_numbers(self):

output = any_locale_word_tokenize(input_text)
assert output == expected_output

@pytest.mark.run_only_on('CPU')
@pytest.mark.unit
def test_french_text_preprocessing_lower(self):
input_text = "pomme banane poire"
expected_output = "pomme banane poire"

output = french_text_preprocessing(input_text)
assert output == expected_output

@pytest.mark.run_only_on('CPU')
@pytest.mark.unit
def test_french_text_preprocessing_mixed(self):
input_text = "BONJOUR le Monde!"
expected_output = "bonjour le monde!"

output = french_text_preprocessing(input_text)
assert output == expected_output

@pytest.mark.run_only_on('CPU')
@pytest.mark.unit
def test_french_text_preprocessing_upper(self):
input_text = "A BIENTÔT."
expected_output = "a bientôt."

output = french_text_preprocessing(input_text)
assert output == expected_output
Collaborator


These tests are for the text_preprocessing functions, not for the tokenize functions. Could you please revise them to follow the unit tests above?

Collaborator


We can remove the three unit tests above and instead add test examples to all the functions related to any_locale_word_tokenize. For example, you could extend the input_text and expected_output below into lists by adding fr-FR examples.

    @pytest.mark.run_only_on('CPU')
    @pytest.mark.unit
    def test_any_locale_word_tokenize(self):
        input_text = "apple banana pear"
        expected_output = self._create_expected_output(["apple", " ", "banana", " ", "pear"])

        output = any_locale_word_tokenize(input_text)
        assert output == expected_output

    @pytest.mark.run_only_on('CPU')
    @pytest.mark.unit
    def test_any_locale_word_tokenize_with_accents(self):
        input_text = "The naïve piñata at the café..."
        expected_output = self._create_expected_output(
            ["The", " ", "naïve", " ", "piñata", " ", "at", " ", "the", " ", "café", "..."]
        )

        output = any_locale_word_tokenize(input_text)
        assert output == expected_output

    @pytest.mark.run_only_on('CPU')
    @pytest.mark.unit
    def test_any_locale_word_tokenize_with_numbers(self):
        input_text = r"Three times× four^teen ÷divided by [movies] on \slash."
        expected_output = self._create_expected_output(
            [
                "Three",
                " ",
                "times",
                "× ",
                "four",
                "^",
                "teen",
                " ÷",
                "divided",
                " ",
                "by",
                " [",
                "movies",
                "] ",
                "on",
                " \\",
                "slash",
                ".",
            ]
        )

        output = any_locale_word_tokenize(input_text)
        assert output == expected_output
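
One possible realization of that suggestion, folding a French case into the same test style; the expected tokenization is an assumption modeled on the accents test above, where "naïve" and "café" pass through as single tokens:

    @pytest.mark.run_only_on('CPU')
    @pytest.mark.unit
    def test_any_locale_word_tokenize_french(self):
        # Hypothetical fr-FR example; accented words are expected to
        # pass through whole, mirroring the accents test above.
        input_text = "à bientôt mon café"
        expected_output = self._create_expected_output(["à", " ", "bientôt", " ", "mon", " ", "café"])

        output = any_locale_word_tokenize(input_text)
        assert output == expected_output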

Expand Up @@ -16,6 +16,7 @@

from nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers import (
EnglishCharsTokenizer,
FrenchCharsTokenizer,
GermanCharsTokenizer,
IPATokenizer,
ItalianCharsTokenizer,
@@ -118,6 +119,18 @@ def test_spanish_chars_tokenizer(self):
assert chars == expected_output
assert len(tokens) == len(input_text)

@pytest.mark.run_only_on('CPU')
@pytest.mark.unit
def test_french_chars_tokenizer(self):
input_text = "Bon après-midi !"
expected_output = "bon après-midi !"

tokenizer = FrenchCharsTokenizer()
chars, tokens = self._parse_text(tokenizer, input_text)

assert chars == expected_output
assert len(tokens) == len(input_text)

XuesongYang marked this conversation as resolved.
@pytest.mark.run_only_on('CPU')
@pytest.mark.unit
def test_ipa_tokenizer(self):