-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
code extracted from ovos-classifiers for better separation of concerns
- Loading branch information
Showing
23 changed files
with
3,547 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
include requirements.txt | ||
include LICENSE | ||
recursive-include ovos_utterance_normalizer * |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,61 @@ | ||
import string | ||
from typing import Optional, List | ||
from ovos_utterance_normalizer.normalizer import Normalizer, CatalanNormalizer, CzechNormalizer, \ | ||
PortugueseNormalizer, AzerbaijaniNormalizer, RussianNormalizer, EnglishNormalizer, UkrainianNormalizer, \ | ||
GermanNormalizer | ||
from ovos_plugin_manager.templates.transformers import UtteranceTransformer | ||
|
||
|
||
class UtteranceNormalizerPlugin(UtteranceTransformer):
    """plugin to normalize utterances by normalizing numbers, punctuation and contractions

    Language specific pre-processing is handled here too; the extra
    normalized variants of each utterance help intent parsers.
    """

    def __init__(self, name="ovos-utterance-normalizer", priority=1):
        super().__init__(name, priority)

    @staticmethod
    def get_normalizer(lang: str):
        """Return the language specific Normalizer for a BCP-47 lang tag.

        Falls back to the generic base ``Normalizer`` for unsupported
        languages.
        """
        lang = lang.lower()  # lang tags may arrive as e.g. "en-US"
        if lang.startswith("en"):
            return EnglishNormalizer()
        elif lang.startswith("pt"):
            return PortugueseNormalizer()
        elif lang.startswith("uk"):
            return UkrainianNormalizer()
        elif lang.startswith("ca"):
            return CatalanNormalizer()
        elif lang.startswith("cs") or lang.startswith("cz"):
            # FIX: Czech is "cs" in BCP-47 lang tags, so real tags like
            # "cs-cz" never matched the old "cz"-only check; accept both
            # prefixes for backwards compatibility
            return CzechNormalizer()
        elif lang.startswith("az"):
            return AzerbaijaniNormalizer()
        elif lang.startswith("ru"):
            return RussianNormalizer()
        elif lang.startswith("de"):
            return GermanNormalizer()
        return Normalizer()

    @staticmethod
    def strip_punctuation(utterance: str):
        """Strip leading/trailing punctuation and surrounding whitespace."""
        return utterance.strip(string.punctuation).strip()

    def transform(self, utterances: List[str],
                  context: Optional[dict] = None) -> (list, dict):
        """Return deduplicated variants of the utterances plus the context.

        For every utterance three variants are produced, in order:
        1 - contractions expanded
        2 - original utterance
        3 - fully normalized utterance
        """
        context = context or {}
        lang = context.get("lang") or self.config.get("lang", "en-us")
        normalizer = self.get_normalizer(lang)

        norm = []
        for u in utterances:
            norm.append(normalizer.expand_contractions(u))
            norm.append(u)
            norm.append(normalizer.normalize(u))

        if self.config.get("strip_punctuation", True):
            norm = [self.strip_punctuation(u) for u in norm]

        # this deduplicates the list while keeping order
        return list(dict.fromkeys(norm)), context
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,235 @@ | ||
import json | ||
import re | ||
from os.path import dirname | ||
from typing import List, Dict | ||
|
||
from ovos_utterance_normalizer.tokenization import word_tokenize | ||
from ovos_utterance_normalizer.numeric import EnglishNumberParser, AzerbaijaniNumberParser, GermanNumberParser | ||
|
||
|
||
class Normalizer:
    # taken from lingua_franca
    """Configurable utterance normalizer.

    Individual languages may subclass this if needed; language specific
    subclasses should provide a valid config read from json via
    ``_default_config``.
    """
    _default_config = {}

    def __init__(self, config=None):
        # fall back to the class-level default config when none is given
        self.config = config or self._default_config

    @staticmethod
    def tokenize(utterance) -> List[str]:
        return word_tokenize(utterance)

    @property
    def should_lowercase(self) -> bool:
        return self.config.get("lowercase", False)

    @property
    def should_numbers_to_digits(self) -> bool:
        return self.config.get("numbers_to_digits", True)

    @property
    def should_expand_contractions(self) -> bool:
        return self.config.get("expand_contractions", True)

    @property
    def should_remove_symbols(self) -> bool:
        return self.config.get("remove_symbols", False)

    @property
    def should_remove_accents(self) -> bool:
        return self.config.get("remove_accents", False)

    @property
    def should_remove_articles(self) -> bool:
        return self.config.get("remove_articles", False)

    @property
    def should_remove_stopwords(self) -> bool:
        return self.config.get("remove_stopwords", False)

    @property
    def contractions(self) -> Dict[str, str]:
        # maps contracted token -> expanded form
        return self.config.get("contractions", {})

    @property
    def word_replacements(self) -> Dict[str, str]:
        # maps token -> replacement token
        return self.config.get("word_replacements", {})

    @property
    def number_replacements(self) -> Dict[str, str]:
        # maps number word -> digit string
        return self.config.get("number_replacements", {})

    @property
    def accents(self) -> Dict[str, str]:
        # maps accented char -> plain ascii counterpart
        return self.config.get("accents",
                               {"á": "a", "à": "a", "ã": "a", "â": "a",
                                "é": "e", "è": "e", "ê": "e", "ẽ": "e",
                                "í": "i", "ì": "i", "î": "i", "ĩ": "i",
                                "ò": "o", "ó": "o", "ô": "o", "õ": "o",
                                "ú": "u", "ù": "u", "û": "u", "ũ": "u",
                                "Á": "A", "À": "A", "Ã": "A", "Â": "A",
                                "É": "E", "È": "E", "Ê": "E", "Ẽ": "E",
                                "Í": "I", "Ì": "I", "Î": "I", "Ĩ": "I",
                                "Ò": "O", "Ó": "O", "Ô": "O", "Õ": "O",
                                "Ú": "U", "Ù": "U", "Û": "U", "Ũ": "U"
                                })

    @property
    def stopwords(self) -> List[str]:
        return self.config.get("stopwords", [])

    @property
    def articles(self) -> List[str]:
        return self.config.get("articles", [])

    @property
    def symbols(self) -> List[str]:
        return self.config.get("symbols",
                               [";", "_", "!", "?", "<", ">", "|",
                                "(", ")", "=", "[", "]", "{", "}",
                                "»", "«", "*", "~", "^", "`", "\""])

    def _substitute_tokens(self, utterance: str,
                           replacements: Dict[str, str]) -> str:
        """Tokenize, swap every token present in *replacements*, re-join."""
        return " ".join(replacements.get(token, token)
                        for token in self.tokenize(utterance))

    def expand_contractions(self, utterance: str) -> str:
        """Expand common contractions, e.g. "isn't" -> "is not"."""
        return self._substitute_tokens(utterance, self.contractions)

    def numbers_to_digits(self, utterance: str) -> str:
        """Replace number-word tokens with their digit form."""
        return self._substitute_tokens(utterance, self.number_replacements)

    def remove_articles(self, utterance: str) -> str:
        """Blank out article tokens (configured via ``articles``)."""
        tokens = ["" if token in self.articles else token
                  for token in self.tokenize(utterance)]
        return " ".join(tokens)

    def remove_stopwords(self, utterance: str) -> str:
        """Blank out stopword tokens (configured via ``stopwords``)."""
        tokens = ["" if token in self.stopwords else token
                  for token in self.tokenize(utterance)]
        utterance = " ".join(tokens)
        # Remove trailing whitespaces from utterance along with orphaned
        # hyphens, more characters may be added later
        return re.sub(r'- *$', '', utterance)

    def remove_symbols(self, utterance: str) -> str:
        """Strip every configured symbol character in a single pass."""
        table = str.maketrans('', '', "".join(self.symbols))
        return utterance.translate(table)

    def remove_accents(self, utterance: str) -> str:
        """Replace accented characters with their plain counterparts."""
        for accented, plain in self.accents.items():
            utterance = utterance.replace(accented, plain)
        return utterance

    def replace_words(self, utterance: str) -> str:
        """Apply configured word-for-word replacements."""
        return self._substitute_tokens(utterance, self.word_replacements)

    def normalize(self, utterance: str = ""):
        """Apply all enabled mutations and removals, in a fixed order."""
        # mutations first
        if self.should_lowercase:
            utterance = utterance.lower()
        if self.should_expand_contractions:
            utterance = self.expand_contractions(utterance)
        if self.should_numbers_to_digits:
            utterance = self.numbers_to_digits(utterance)
        utterance = self.replace_words(utterance)

        # then removals
        if self.should_remove_symbols:
            utterance = self.remove_symbols(utterance)
        if self.should_remove_accents:
            utterance = self.remove_accents(utterance)
        if self.should_remove_articles:
            utterance = self.remove_articles(utterance)
        if self.should_remove_stopwords:
            utterance = self.remove_stopwords(utterance)
        # collapse the extra spaces introduced by the removals above
        return " ".join(filter(None, utterance.split(" ")))
|
||
|
||
class CatalanNormalizer(Normalizer):
    """Catalan specific Normalizer; config loaded from bundled json."""
    # FIX: read the resource explicitly as utf8 - the accented characters in
    # the config would break under a non-utf8 locale default encoding
    # (e.g. cp1252 on Windows); matches the cz/ru/uk loaders
    with open(f"{dirname(dirname(__file__))}/res/ca/normalize.json", encoding='utf8') as f:
        _default_config = json.load(f)

    @staticmethod
    def tokenize(utterance: str) -> List[str]:
        """Tokenize with Catalan specific rules."""
        return word_tokenize(utterance, lang="ca")
|
||
|
||
class CzechNormalizer(Normalizer):
    """Czech specific Normalizer; config loaded from bundled json."""
    with open(f"{dirname(dirname(__file__))}/res/cz/normalize.json", encoding='utf8') as conf_file:
        _default_config = json.load(conf_file)
|
||
|
||
class PortugueseNormalizer(Normalizer):
    """Portuguese specific Normalizer; config loaded from bundled json."""
    # FIX: read the resource explicitly as utf8 - the accented characters in
    # the config would break under a non-utf8 locale default encoding
    # (e.g. cp1252 on Windows); matches the cz/ru/uk loaders
    with open(f"{dirname(dirname(__file__))}/res/pt/normalize.json", encoding='utf8') as f:
        _default_config = json.load(f)

    @staticmethod
    def tokenize(utterance: str) -> List[str]:
        """Tokenize with Portuguese specific rules."""
        return word_tokenize(utterance, lang="pt")
|
||
|
||
class RussianNormalizer(Normalizer):
    """Russian specific Normalizer; config loaded from bundled json."""
    with open(f"{dirname(dirname(__file__))}/res/ru/normalize.json", encoding='utf8') as conf_file:
        _default_config = json.load(conf_file)
|
||
|
||
class UkrainianNormalizer(Normalizer):
    """Ukrainian specific Normalizer; config loaded from bundled json."""
    with open(f"{dirname(dirname(__file__))}/res/uk/normalize.json", encoding='utf8') as conf_file:
        _default_config = json.load(conf_file)
|
||
|
||
class EnglishNormalizer(Normalizer):
    """English specific Normalizer; config loaded from bundled json."""
    # FIX: read the resource explicitly as utf8 - the locale default encoding
    # is not guaranteed to be utf8 (e.g. cp1252 on Windows); matches the
    # cz/ru/uk loaders
    with open(f"{dirname(dirname(__file__))}/res/en/normalize.json", encoding='utf8') as f:
        _default_config = json.load(f)

    def numbers_to_digits(self, utterance: str) -> str:
        """Convert spoken number words to digits, e.g. "twenty two" -> "22"."""
        return EnglishNumberParser().convert_words_to_numbers(utterance)
|
||
|
||
class AzerbaijaniNormalizer(Normalizer):
    """Azerbaijani specific Normalizer; config loaded from bundled json."""
    # FIX: read the resource explicitly as utf8 - Azerbaijani uses characters
    # outside ascii that would break under a non-utf8 locale default encoding
    # (e.g. cp1252 on Windows); matches the cz/ru/uk loaders
    with open(f"{dirname(dirname(__file__))}/res/az/normalize.json", encoding='utf8') as f:
        _default_config = json.load(f)

    def numbers_to_digits(self, utterance: str) -> str:
        """Convert spoken number words to digits."""
        return AzerbaijaniNumberParser().convert_words_to_numbers(utterance)
|
||
|
||
class GermanNormalizer(Normalizer):
    """German specific Normalizer; config loaded from bundled json."""
    # FIX: read the resource explicitly as utf8 - German umlauts in the config
    # would break under a non-utf8 locale default encoding (e.g. cp1252 on
    # Windows); matches the cz/ru/uk loaders
    with open(f"{dirname(dirname(__file__))}/res/de/normalize.json", encoding='utf8') as f:
        _default_config = json.load(f)

    def numbers_to_digits(self, utterance: str) -> str:
        """Convert spoken number words to digits."""
        return GermanNumberParser().convert_words_to_numbers(utterance)

    def remove_symbols(self, utterance: str) -> str:
        """Strip configured symbols, first splitting hyphenated words.

        Special rule for hyphenated words in German, as some STT engines
        falsely return them pretty regularly.
        """
        utterance = re.sub(r'\b(\w*)-(\w*)\b', r'\1 \2', utterance)
        return super().remove_symbols(utterance)
Oops, something went wrong.