initial port #1

Merged
merged 1 commit on Aug 5, 2024
3 changes: 3 additions & 0 deletions MANIFEST.in
@@ -0,0 +1,3 @@
include requirements.txt
include LICENSE
recursive-include ovos_utterance_normalizer *
61 changes: 61 additions & 0 deletions ovos_utterance_normalizer/__init__.py
@@ -0,0 +1,61 @@
import string
from typing import Optional, List, Tuple
from ovos_utterance_normalizer.normalizer import Normalizer, CatalanNormalizer, CzechNormalizer, \
PortugueseNormalizer, AzerbaijaniNormalizer, RussianNormalizer, EnglishNormalizer, UkrainianNormalizer, \
GermanNormalizer
from ovos_plugin_manager.templates.transformers import UtteranceTransformer


class UtteranceNormalizerPlugin(UtteranceTransformer):
"""plugin to normalize utterances by normalizing numbers, punctuation and contractions
language specific pre-processing is handled here too
this helps intent parsers"""

def __init__(self, name="ovos-utterance-normalizer", priority=1):
super().__init__(name, priority)

@staticmethod
def get_normalizer(lang: str):
if lang.startswith("en"):
return EnglishNormalizer()
elif lang.startswith("pt"):
return PortugueseNormalizer()
elif lang.startswith("uk"):
return UkrainianNormalizer()
elif lang.startswith("ca"):
return CatalanNormalizer()
elif lang.startswith("cz"):
return CzechNormalizer()
elif lang.startswith("az"):
return AzerbaijaniNormalizer()
elif lang.startswith("ru"):
return RussianNormalizer()
elif lang.startswith("de"):
return GermanNormalizer()
return Normalizer()

@staticmethod
    def strip_punctuation(utterance: str) -> str:
return utterance.strip(string.punctuation).strip()

    def transform(self, utterances: List[str],
                  context: Optional[dict] = None) -> Tuple[List[str], dict]:
context = context or {}
lang = context.get("lang") or self.config.get("lang", "en-us")
normalizer = self.get_normalizer(lang)

norm = []
# 1 - expand contractions
# 2 - original utterance
# 3 - normalized utterance
for u in utterances:
norm.append(normalizer.expand_contractions(u))
norm.append(u)
norm.append(normalizer.normalize(u))

if self.config.get("strip_punctuation", True):
norm = [self.strip_punctuation(u) for u in norm]

# this deduplicates the list while keeping order
return list(dict.fromkeys(norm)), context
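
For context, a minimal usage sketch of the plugin above (the input utterance and the expected output are illustrative; the exact result depends on the shipped en normalize.json resources):

from ovos_utterance_normalizer import UtteranceNormalizerPlugin

plugin = UtteranceNormalizerPlugin()
utterances, context = plugin.transform(["what's the weather like?"],
                                       {"lang": "en-us"})
# Result contains the contraction-expanded, original, and normalized
# variants, deduplicated while preserving order, e.g. roughly
# ["what is the weather like", "what's the weather like"]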

235 changes: 235 additions & 0 deletions ovos_utterance_normalizer/normalizer.py
@@ -0,0 +1,235 @@
import json
import re
from os.path import dirname
from typing import List, Dict

from ovos_utterance_normalizer.tokenization import word_tokenize
from ovos_utterance_normalizer.numeric import EnglishNumberParser, AzerbaijaniNumberParser, GermanNumberParser


class Normalizer:
    """Language-agnostic normalizer, taken from lingua_franca.

    Individual languages may subclass this if needed; language
    subclasses load a valid config read from JSON into _default_config.
    """
_default_config = {}

def __init__(self, config=None):
self.config = config or self._default_config

@staticmethod
    def tokenize(utterance: str) -> List[str]:
return word_tokenize(utterance)

@property
def should_lowercase(self) -> bool:
return self.config.get("lowercase", False)

@property
def should_numbers_to_digits(self) -> bool:
return self.config.get("numbers_to_digits", True)

@property
def should_expand_contractions(self) -> bool:
return self.config.get("expand_contractions", True)

@property
def should_remove_symbols(self) -> bool:
return self.config.get("remove_symbols", False)

@property
def should_remove_accents(self) -> bool:
return self.config.get("remove_accents", False)

@property
def should_remove_articles(self) -> bool:
return self.config.get("remove_articles", False)

@property
def should_remove_stopwords(self) -> bool:
return self.config.get("remove_stopwords", False)

@property
def contractions(self) -> Dict[str, str]:
return self.config.get("contractions", {})

@property
def word_replacements(self) -> Dict[str, str]:
return self.config.get("word_replacements", {})

@property
def number_replacements(self) -> Dict[str, str]:
return self.config.get("number_replacements", {})

@property
def accents(self) -> Dict[str, str]:
return self.config.get("accents",
{"á": "a", "à": "a", "ã": "a", "â": "a",
"é": "e", "è": "e", "ê": "e", "ẽ": "e",
"í": "i", "ì": "i", "î": "i", "ĩ": "i",
"ò": "o", "ó": "o", "ô": "o", "õ": "o",
"ú": "u", "ù": "u", "û": "u", "ũ": "u",
"Á": "A", "À": "A", "Ã": "A", "Â": "A",
"É": "E", "È": "E", "Ê": "E", "Ẽ": "E",
"Í": "I", "Ì": "I", "Î": "I", "Ĩ": "I",
"Ò": "O", "Ó": "O", "Ô": "O", "Õ": "O",
"Ú": "U", "Ù": "U", "Û": "U", "Ũ": "U"
})

@property
def stopwords(self) -> List[str]:
return self.config.get("stopwords", [])

@property
def articles(self) -> List[str]:
return self.config.get("articles", [])

@property
def symbols(self) -> List[str]:
return self.config.get("symbols",
[";", "_", "!", "?", "<", ">", "|",
"(", ")", "=", "[", "]", "{", "}",
"»", "«", "*", "~", "^", "`", "\""])

def expand_contractions(self, utterance: str) -> str:
""" Expand common contractions, e.g. "isn't" -> "is not" """
words = self.tokenize(utterance)
for idx, w in enumerate(words):
if w in self.contractions:
words[idx] = self.contractions[w]
utterance = " ".join(words)
return utterance
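
Because the base Normalizer reads everything from its config dict, contraction expansion is easy to exercise in isolation. A small sketch with a hand-written config (the mapping is illustrative, not the shipped resource, and it assumes word_tokenize keeps contractions as single tokens):

norm = Normalizer({"contractions": {"I'm": "I am", "isn't": "is not"}})
norm.expand_contractions("I'm sure it isn't broken")
# -> "I am sure it is not broken" (tokens are rejoined with single spaces)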

def numbers_to_digits(self, utterance: str) -> str:
words = self.tokenize(utterance)
for idx, w in enumerate(words):
if w in self.number_replacements:
words[idx] = self.number_replacements[w]
utterance = " ".join(words)
return utterance

def remove_articles(self, utterance: str) -> str:
words = self.tokenize(utterance)
for idx, w in enumerate(words):
if w in self.articles:
words[idx] = ""
utterance = " ".join(words)
return utterance

def remove_stopwords(self, utterance: str) -> str:
words = self.tokenize(utterance)
for idx, w in enumerate(words):
if w in self.stopwords:
words[idx] = ""
# if words[-1] == '-':
# words = words[:-1]
utterance = " ".join(words)
# Remove trailing whitespaces from utterance along with orphaned
# hyphens, more characters may be added later
utterance = re.sub(r'- *$', '', utterance)
return utterance
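
The closing substitution only trims a hyphen left dangling at the end of the utterance once stopwords are blanked out, e.g.:

import re
re.sub(r'- *$', '', "turn the lights -")  # -> "turn the lights "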

def remove_symbols(self, utterance: str) -> str:
mapping = str.maketrans('', '', "".join(self.symbols))
return utterance.translate(mapping)

    def remove_accents(self, utterance: str) -> str:
for s in self.accents:
utterance = utterance.replace(s, self.accents[s])
return utterance

def replace_words(self, utterance: str) -> str:
words = self.tokenize(utterance)
for idx, w in enumerate(words):
if w in self.word_replacements:
words[idx] = self.word_replacements[w]
utterance = " ".join(words)
return utterance

    def normalize(self, utterance: str = "") -> str:
# mutations
if self.should_lowercase:
utterance = utterance.lower()
if self.should_expand_contractions:
utterance = self.expand_contractions(utterance)
if self.should_numbers_to_digits:
utterance = self.numbers_to_digits(utterance)
utterance = self.replace_words(utterance)

# removals
if self.should_remove_symbols:
utterance = self.remove_symbols(utterance)
if self.should_remove_accents:
utterance = self.remove_accents(utterance)
if self.should_remove_articles:
utterance = self.remove_articles(utterance)
if self.should_remove_stopwords:
utterance = self.remove_stopwords(utterance)
# remove extra spaces
utterance = " ".join([w for w in utterance.split(" ") if w])
return utterance
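
A quick sketch of the full pipeline with an explicit config (the replacement mappings are illustrative; real configs come from the per-language normalize.json files):

norm = Normalizer({
    "lowercase": True,
    "contractions": {"what's": "what is"},
    "number_replacements": {"two": "2"},
    "remove_symbols": True,
})
norm.normalize("What's two plus two?")
# -> "what is 2 plus 2", assuming word_tokenize splits the trailing "?"
# so that "two" matches the number replacement; the "?" itself is stripped
# by remove_symbols and extra spaces are collapsed at the end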


class CatalanNormalizer(Normalizer):
with open(f"{dirname(dirname(__file__))}/res/ca/normalize.json") as f:
_default_config = json.load(f)

@staticmethod
    def tokenize(utterance: str) -> List[str]:
return word_tokenize(utterance, lang="ca")


class CzechNormalizer(Normalizer):
with open(f"{dirname(dirname(__file__))}/res/cz/normalize.json", encoding='utf8') as f:
_default_config = json.load(f)


class PortugueseNormalizer(Normalizer):
with open(f"{dirname(dirname(__file__))}/res/pt/normalize.json") as f:
_default_config = json.load(f)

@staticmethod
def tokenize(utterance: str) -> List[str]:
return word_tokenize(utterance, lang="pt")


class RussianNormalizer(Normalizer):
with open(f"{dirname(dirname(__file__))}/res/ru/normalize.json", encoding='utf8') as f:
_default_config = json.load(f)


class UkrainianNormalizer(Normalizer):
with open(f"{dirname(dirname(__file__))}/res/uk/normalize.json", encoding='utf8') as f:
_default_config = json.load(f)


class EnglishNormalizer(Normalizer):
with open(f"{dirname(dirname(__file__))}/res/en/normalize.json") as f:
_default_config = json.load(f)

def numbers_to_digits(self, utterance: str) -> str:
return EnglishNumberParser().convert_words_to_numbers(utterance)


class AzerbaijaniNormalizer(Normalizer):
with open(f"{dirname(dirname(__file__))}/res/az/normalize.json") as f:
_default_config = json.load(f)

def numbers_to_digits(self, utterance: str) -> str:
return AzerbaijaniNumberParser().convert_words_to_numbers(utterance)


class GermanNormalizer(Normalizer):
with open(f"{dirname(dirname(__file__))}/res/de/normalize.json") as f:
_default_config = json.load(f)

def numbers_to_digits(self, utterance: str) -> str:
return GermanNumberParser().convert_words_to_numbers(utterance)

def remove_symbols(self, utterance: str) -> str:
        # special rule for hyphenated words in German, since some STT
        # engines falsely return them fairly regularly
utterance = re.sub(r'\b(\w*)-(\w*)\b', r'\1 \2', utterance)
return super().remove_symbols(utterance)
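
The substitution runs before the generic symbol removal and splits hyphenated compounds in place; a quick illustration of just that regex:

import re
re.sub(r'\b(\w*)-(\w*)\b', r'\1 \2', "schalte das E-Mail-Programm ein")
# -> "schalte das E Mail Programm ein"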