initial port (#1)
code extracted from ovos-classifiers for better separation of concerns
JarbasAl authored Aug 5, 2024
1 parent 4df860f commit d1ece9c
Showing 23 changed files with 3,547 additions and 0 deletions.
3 changes: 3 additions & 0 deletions MANIFEST.in
@@ -0,0 +1,3 @@
include requirements.txt
include LICENSE
recursive-include ovos_utterance_normalizer *
61 changes: 61 additions & 0 deletions ovos_utterance_normalizer/__init__.py
@@ -0,0 +1,61 @@
import string
from typing import Optional, List, Tuple
from ovos_utterance_normalizer.normalizer import Normalizer, CatalanNormalizer, CzechNormalizer, \
PortugueseNormalizer, AzerbaijaniNormalizer, RussianNormalizer, EnglishNormalizer, UkrainianNormalizer, \
GermanNormalizer
from ovos_plugin_manager.templates.transformers import UtteranceTransformer


class UtteranceNormalizerPlugin(UtteranceTransformer):
"""plugin to normalize utterances by normalizing numbers, punctuation and contractions
language specific pre-processing is handled here too
this helps intent parsers"""

def __init__(self, name="ovos-utterance-normalizer", priority=1):
super().__init__(name, priority)

@staticmethod
    def get_normalizer(lang: str) -> Normalizer:
if lang.startswith("en"):
return EnglishNormalizer()
elif lang.startswith("pt"):
return PortugueseNormalizer()
elif lang.startswith("uk"):
return UkrainianNormalizer()
elif lang.startswith("ca"):
return CatalanNormalizer()
elif lang.startswith("cz"):
return CzechNormalizer()
elif lang.startswith("az"):
return AzerbaijaniNormalizer()
elif lang.startswith("ru"):
return RussianNormalizer()
elif lang.startswith("de"):
return GermanNormalizer()
return Normalizer()
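    # e.g. get_normalizer("pt-BR") -> PortugueseNormalizer();
    # unrecognized language codes fall back to the plain Normalizer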

@staticmethod
    def strip_punctuation(utterance: str) -> str:
return utterance.strip(string.punctuation).strip()

    def transform(self, utterances: List[str],
                  context: Optional[dict] = None) -> Tuple[List[str], dict]:
context = context or {}
lang = context.get("lang") or self.config.get("lang", "en-us")
normalizer = self.get_normalizer(lang)

norm = []
        # for each utterance emit three variants:
        # 1 - contractions expanded
        # 2 - the original utterance
        # 3 - the normalized utterance
for u in utterances:
norm.append(normalizer.expand_contractions(u))
norm.append(u)
norm.append(normalizer.normalize(u))

if self.config.get("strip_punctuation", True):
norm = [self.strip_punctuation(u) for u in norm]

# this deduplicates the list while keeping order
return list(dict.fromkeys(norm)), context
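
A minimal usage sketch for the plugin above (not part of the commit; the exact variants depend on the contraction and number tables shipped under res/, so the output shown is illustrative):

from ovos_utterance_normalizer import UtteranceNormalizerPlugin

plugin = UtteranceNormalizerPlugin()
# transform returns the list of variants plus the (untouched) context
utterances, context = plugin.transform(["I'm five!"], {"lang": "en-us"})
# roughly ["I am five", "I'm five", "I am 5"]:
# contractions expanded / original / normalized, with trailing
# punctuation stripped and duplicates removed while preserving order
print(utterances)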

235 changes: 235 additions & 0 deletions ovos_utterance_normalizer/normalizer.py
@@ -0,0 +1,235 @@
import json
import re
from os.path import dirname
from typing import List, Dict

from ovos_utterance_normalizer.tokenization import word_tokenize
from ovos_utterance_normalizer.numeric import EnglishNumberParser, AzerbaijaniNumberParser, GermanNumberParser


class Normalizer:
    # adapted from lingua_franca
    """Base normalizer.

    Individual languages may subclass this if needed; subclasses should
    provide a valid config read from json.
    """
_default_config = {}

def __init__(self, config=None):
self.config = config or self._default_config

@staticmethod
def tokenize(utterance) -> List[str]:
return word_tokenize(utterance)

@property
def should_lowercase(self) -> bool:
return self.config.get("lowercase", False)

@property
def should_numbers_to_digits(self) -> bool:
return self.config.get("numbers_to_digits", True)

@property
def should_expand_contractions(self) -> bool:
return self.config.get("expand_contractions", True)

@property
def should_remove_symbols(self) -> bool:
return self.config.get("remove_symbols", False)

@property
def should_remove_accents(self) -> bool:
return self.config.get("remove_accents", False)

@property
def should_remove_articles(self) -> bool:
return self.config.get("remove_articles", False)

@property
def should_remove_stopwords(self) -> bool:
return self.config.get("remove_stopwords", False)

@property
def contractions(self) -> Dict[str, str]:
return self.config.get("contractions", {})

@property
def word_replacements(self) -> Dict[str, str]:
return self.config.get("word_replacements", {})

@property
def number_replacements(self) -> Dict[str, str]:
return self.config.get("number_replacements", {})

@property
def accents(self) -> Dict[str, str]:
return self.config.get("accents",
{"á": "a", "à": "a", "ã": "a", "â": "a",
"é": "e", "è": "e", "ê": "e", "ẽ": "e",
"í": "i", "ì": "i", "î": "i", "ĩ": "i",
"ò": "o", "ó": "o", "ô": "o", "õ": "o",
"ú": "u", "ù": "u", "û": "u", "ũ": "u",
"Á": "A", "À": "A", "Ã": "A", "Â": "A",
"É": "E", "È": "E", "Ê": "E", "Ẽ": "E",
"Í": "I", "Ì": "I", "Î": "I", "Ĩ": "I",
"Ò": "O", "Ó": "O", "Ô": "O", "Õ": "O",
"Ú": "U", "Ù": "U", "Û": "U", "Ũ": "U"
})

@property
def stopwords(self) -> List[str]:
return self.config.get("stopwords", [])

@property
def articles(self) -> List[str]:
return self.config.get("articles", [])

@property
def symbols(self) -> List[str]:
return self.config.get("symbols",
[";", "_", "!", "?", "<", ">", "|",
"(", ")", "=", "[", "]", "{", "}",
"»", "«", "*", "~", "^", "`", "\""])

def expand_contractions(self, utterance: str) -> str:
""" Expand common contractions, e.g. "isn't" -> "is not" """
words = self.tokenize(utterance)
for idx, w in enumerate(words):
if w in self.contractions:
words[idx] = self.contractions[w]
utterance = " ".join(words)
return utterance

def numbers_to_digits(self, utterance: str) -> str:
words = self.tokenize(utterance)
for idx, w in enumerate(words):
if w in self.number_replacements:
words[idx] = self.number_replacements[w]
utterance = " ".join(words)
return utterance

def remove_articles(self, utterance: str) -> str:
words = self.tokenize(utterance)
for idx, w in enumerate(words):
if w in self.articles:
words[idx] = ""
utterance = " ".join(words)
return utterance

def remove_stopwords(self, utterance: str) -> str:
words = self.tokenize(utterance)
for idx, w in enumerate(words):
if w in self.stopwords:
words[idx] = ""
# if words[-1] == '-':
# words = words[:-1]
utterance = " ".join(words)
# Remove trailing whitespaces from utterance along with orphaned
# hyphens, more characters may be added later
utterance = re.sub(r'- *$', '', utterance)
return utterance

def remove_symbols(self, utterance: str) -> str:
mapping = str.maketrans('', '', "".join(self.symbols))
return utterance.translate(mapping)

    def remove_accents(self, utterance: str) -> str:
for s in self.accents:
utterance = utterance.replace(s, self.accents[s])
return utterance

def replace_words(self, utterance: str) -> str:
words = self.tokenize(utterance)
for idx, w in enumerate(words):
if w in self.word_replacements:
words[idx] = self.word_replacements[w]
utterance = " ".join(words)
return utterance

    def normalize(self, utterance: str = "") -> str:
# mutations
if self.should_lowercase:
utterance = utterance.lower()
if self.should_expand_contractions:
utterance = self.expand_contractions(utterance)
if self.should_numbers_to_digits:
utterance = self.numbers_to_digits(utterance)
utterance = self.replace_words(utterance)

# removals
if self.should_remove_symbols:
utterance = self.remove_symbols(utterance)
if self.should_remove_accents:
utterance = self.remove_accents(utterance)
if self.should_remove_articles:
utterance = self.remove_articles(utterance)
if self.should_remove_stopwords:
utterance = self.remove_stopwords(utterance)
# remove extra spaces
utterance = " ".join([w for w in utterance.split(" ") if w])
return utterance


class CatalanNormalizer(Normalizer):
    with open(f"{dirname(dirname(__file__))}/res/ca/normalize.json", encoding='utf8') as f:
_default_config = json.load(f)

@staticmethod
    def tokenize(utterance: str) -> List[str]:
return word_tokenize(utterance, lang="ca")


class CzechNormalizer(Normalizer):
with open(f"{dirname(dirname(__file__))}/res/cz/normalize.json", encoding='utf8') as f:
_default_config = json.load(f)


class PortugueseNormalizer(Normalizer):
    with open(f"{dirname(dirname(__file__))}/res/pt/normalize.json", encoding='utf8') as f:
_default_config = json.load(f)

@staticmethod
def tokenize(utterance: str) -> List[str]:
return word_tokenize(utterance, lang="pt")


class RussianNormalizer(Normalizer):
with open(f"{dirname(dirname(__file__))}/res/ru/normalize.json", encoding='utf8') as f:
_default_config = json.load(f)


class UkrainianNormalizer(Normalizer):
with open(f"{dirname(dirname(__file__))}/res/uk/normalize.json", encoding='utf8') as f:
_default_config = json.load(f)


class EnglishNormalizer(Normalizer):
    with open(f"{dirname(dirname(__file__))}/res/en/normalize.json", encoding='utf8') as f:
_default_config = json.load(f)

def numbers_to_digits(self, utterance: str) -> str:
return EnglishNumberParser().convert_words_to_numbers(utterance)


class AzerbaijaniNormalizer(Normalizer):
    with open(f"{dirname(dirname(__file__))}/res/az/normalize.json", encoding='utf8') as f:
_default_config = json.load(f)

def numbers_to_digits(self, utterance: str) -> str:
return AzerbaijaniNumberParser().convert_words_to_numbers(utterance)


class GermanNormalizer(Normalizer):
    with open(f"{dirname(dirname(__file__))}/res/de/normalize.json", encoding='utf8') as f:
_default_config = json.load(f)

def numbers_to_digits(self, utterance: str) -> str:
return GermanNumberParser().convert_words_to_numbers(utterance)

def remove_symbols(self, utterance: str) -> str:
        # special rule for hyphenated words in German, since some STT engines
        # regularly return them by mistake
utterance = re.sub(r'\b(\w*)-(\w*)\b', r'\1 \2', utterance)
return super().remove_symbols(utterance)
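
To illustrate the order of operations in Normalizer.normalize (lowercase, contraction expansion, numbers to digits, word replacements, then removals), a sketch with a hand-rolled inline config; the keys mirror the properties above, while the real language classes load theirs from res/<lang>/normalize.json:

from ovos_utterance_normalizer.normalizer import Normalizer

config = {
    "lowercase": True,
    "contractions": {"isn't": "is not"},
    "number_replacements": {"two": "2"},
    "remove_symbols": True,   # "?" is in the default symbols list
}
norm = Normalizer(config)
# assuming the tokenizer keeps "isn't" as one token and splits the
# trailing "?" off, this prints "is not this 2"
print(norm.normalize("Isn't this two?"))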