diff --git a/MANIFEST.in b/MANIFEST.in
new file mode 100644
index 0000000..2921037
--- /dev/null
+++ b/MANIFEST.in
@@ -0,0 +1,3 @@
+include requirements.txt
+include LICENSE
+recursive-include ovos_utterance_normalizer *
\ No newline at end of file
diff --git a/ovos_utterance_normalizer/__init__.py b/ovos_utterance_normalizer/__init__.py
new file mode 100644
index 0000000..aabb1cf
--- /dev/null
+++ b/ovos_utterance_normalizer/__init__.py
@@ -0,0 +1,61 @@
+import string
+from typing import Optional, List
+from ovos_utterance_normalizer.normalizer import Normalizer, CatalanNormalizer, CzechNormalizer, \
+    PortugueseNormalizer, AzerbaijaniNormalizer, RussianNormalizer, EnglishNormalizer, UkrainianNormalizer, \
+    GermanNormalizer
+from ovos_plugin_manager.templates.transformers import UtteranceTransformer
+
+
+class UtteranceNormalizerPlugin(UtteranceTransformer):
+    """Plugin to normalize utterances: numbers, punctuation and contractions.
+    Language-specific pre-processing is handled here too; this helps
+    intent parsers."""
+
+    def __init__(self, name="ovos-utterance-normalizer", priority=1):
+        super().__init__(name, priority)
+
+    @staticmethod
+    def get_normalizer(lang: str):
+        if lang.startswith("en"):
+            return EnglishNormalizer()
+        elif lang.startswith("pt"):
+            return PortugueseNormalizer()
+        elif lang.startswith("uk"):
+            return UkrainianNormalizer()
+        elif lang.startswith("ca"):
+            return CatalanNormalizer()
+        elif lang.startswith("cs"):  # Czech (BCP-47 language code "cs")
+            return CzechNormalizer()
+        elif lang.startswith("az"):
+            return AzerbaijaniNormalizer()
+        elif lang.startswith("ru"):
+            return RussianNormalizer()
+        elif lang.startswith("de"):
+            return GermanNormalizer()
+        return Normalizer()
+
+    @staticmethod
+    def strip_punctuation(utterance: str):
+        return utterance.strip(string.punctuation).strip()
+
+    def transform(self, utterances: List[str],
+                  context: Optional[dict] = None) -> (list, dict):
+        context = context or {}
+        lang = context.get("lang") or self.config.get("lang", "en-us")
+        normalizer = self.get_normalizer(lang)
+
+        norm = []
+        # 1 - utterance with contractions expanded
+        # 2 - original utterance
+        # 3 - normalized utterance
+        for u in utterances:
+            norm.append(normalizer.expand_contractions(u))
+            norm.append(u)
+            norm.append(normalizer.normalize(u))
+
+        if self.config.get("strip_punctuation", True):
+            norm = [self.strip_punctuation(u) for u in norm]
+
+        # this deduplicates the list while keeping order
+        return list(dict.fromkeys(norm)), context
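A minimal usage sketch of the plugin above, assuming ovos-plugin-manager is installed and the English resources ship with the package; the exact strings depend on res/en/normalize.json, so the printed values are indicative only:

    from ovos_utterance_normalizer import UtteranceNormalizerPlugin

    plugin = UtteranceNormalizerPlugin()
    # transform() returns (utterances, context); the list keeps the
    # contraction-expanded, original and fully normalized variants,
    # deduplicated while preserving order
    utterances, context = plugin.transform(["what's two plus two"],
                                           {"lang": "en-us"})
    print(utterances)
    # indicatively: ["what is two plus two", "what's two plus two",
    #                "what is 2 plus 2"]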
diff --git a/ovos_utterance_normalizer/normalizer.py b/ovos_utterance_normalizer/normalizer.py
new file mode 100644
index 0000000..751a5ca
--- /dev/null
+++ b/ovos_utterance_normalizer/normalizer.py
@@ -0,0 +1,235 @@
+import json
+import re
+from os.path import dirname
+from typing import List, Dict
+
+from ovos_utterance_normalizer.tokenization import word_tokenize
+from ovos_utterance_normalizer.numeric import EnglishNumberParser, AzerbaijaniNumberParser, GermanNumberParser
+
+
+class Normalizer:
+    # taken from lingua_franca
+    """
+    individual languages may subclass this if needed
+
+    normalize_XX should pass a valid config read from json
+    """
+    _default_config = {}
+
+    def __init__(self, config=None):
+        self.config = config or self._default_config
+
+    @staticmethod
+    def tokenize(utterance) -> List[str]:
+        return word_tokenize(utterance)
+
+    @property
+    def should_lowercase(self) -> bool:
+        return self.config.get("lowercase", False)
+
+    @property
+    def should_numbers_to_digits(self) -> bool:
+        return self.config.get("numbers_to_digits", True)
+
+    @property
+    def should_expand_contractions(self) -> bool:
+        return self.config.get("expand_contractions", True)
+
+    @property
+    def should_remove_symbols(self) -> bool:
+        return self.config.get("remove_symbols", False)
+
+    @property
+    def should_remove_accents(self) -> bool:
+        return self.config.get("remove_accents", False)
+
+    @property
+    def should_remove_articles(self) -> bool:
+        return self.config.get("remove_articles", False)
+
+    @property
+    def should_remove_stopwords(self) -> bool:
+        return self.config.get("remove_stopwords", False)
+
+    @property
+    def contractions(self) -> Dict[str, str]:
+        return self.config.get("contractions", {})
+
+    @property
+    def word_replacements(self) -> Dict[str, str]:
+        return self.config.get("word_replacements", {})
+
+    @property
+    def number_replacements(self) -> Dict[str, str]:
+        return self.config.get("number_replacements", {})
+
+    @property
+    def accents(self) -> Dict[str, str]:
+        return self.config.get("accents",
+                               {"á": "a", "à": "a", "ã": "a", "â": "a",
+                                "é": "e", "è": "e", "ê": "e", "ẽ": "e",
+                                "í": "i", "ì": "i", "î": "i", "ĩ": "i",
+                                "ò": "o", "ó": "o", "ô": "o", "õ": "o",
+                                "ú": "u", "ù": "u", "û": "u", "ũ": "u",
+                                "Á": "A", "À": "A", "Ã": "A", "Â": "A",
+                                "É": "E", "È": "E", "Ê": "E", "Ẽ": "E",
+                                "Í": "I", "Ì": "I", "Î": "I", "Ĩ": "I",
+                                "Ò": "O", "Ó": "O", "Ô": "O", "Õ": "O",
+                                "Ú": "U", "Ù": "U", "Û": "U", "Ũ": "U"
+                                })
+
+    @property
+    def stopwords(self) -> List[str]:
+        return self.config.get("stopwords", [])
+
+    @property
+    def articles(self) -> List[str]:
+        return self.config.get("articles", [])
+
+    @property
+    def symbols(self) -> List[str]:
+        return self.config.get("symbols",
+                               [";", "_", "!", "?", "<", ">", "|",
+                                "(", ")", "=", "[", "]", "{", "}",
+                                "»", "«", "*", "~", "^", "`", "\""])
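All flags and lookup tables above are read from the config dict, so callers can hand-roll a config instead of relying on the per-language JSON files. A small sketch, with keys exactly as defined by the getters above:

    config = {
        "lowercase": True,
        "remove_symbols": True,
        "contractions": {"can't": "can not"},
        "number_replacements": {"one": "1", "two": "2"},
    }
    norm = Normalizer(config)  # base class defined above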
"isn't" -> "is not" """ + words = self.tokenize(utterance) + for idx, w in enumerate(words): + if w in self.contractions: + words[idx] = self.contractions[w] + utterance = " ".join(words) + return utterance + + def numbers_to_digits(self, utterance: str) -> str: + words = self.tokenize(utterance) + for idx, w in enumerate(words): + if w in self.number_replacements: + words[idx] = self.number_replacements[w] + utterance = " ".join(words) + return utterance + + def remove_articles(self, utterance: str) -> str: + words = self.tokenize(utterance) + for idx, w in enumerate(words): + if w in self.articles: + words[idx] = "" + utterance = " ".join(words) + return utterance + + def remove_stopwords(self, utterance: str) -> str: + words = self.tokenize(utterance) + for idx, w in enumerate(words): + if w in self.stopwords: + words[idx] = "" + # if words[-1] == '-': + # words = words[:-1] + utterance = " ".join(words) + # Remove trailing whitespaces from utterance along with orphaned + # hyphens, more characters may be added later + utterance = re.sub(r'- *$', '', utterance) + return utterance + + def remove_symbols(self, utterance: str) -> str: + mapping = str.maketrans('', '', "".join(self.symbols)) + return utterance.translate(mapping) + + def remove_accents(self, utterance : str) -> str: + for s in self.accents: + utterance = utterance.replace(s, self.accents[s]) + return utterance + + def replace_words(self, utterance: str) -> str: + words = self.tokenize(utterance) + for idx, w in enumerate(words): + if w in self.word_replacements: + words[idx] = self.word_replacements[w] + utterance = " ".join(words) + return utterance + + def normalize(self, utterance: str = ""): + # mutations + if self.should_lowercase: + utterance = utterance.lower() + if self.should_expand_contractions: + utterance = self.expand_contractions(utterance) + if self.should_numbers_to_digits: + utterance = self.numbers_to_digits(utterance) + utterance = self.replace_words(utterance) + + # removals + if self.should_remove_symbols: + utterance = self.remove_symbols(utterance) + if self.should_remove_accents: + utterance = self.remove_accents(utterance) + if self.should_remove_articles: + utterance = self.remove_articles(utterance) + if self.should_remove_stopwords: + utterance = self.remove_stopwords(utterance) + # remove extra spaces + utterance = " ".join([w for w in utterance.split(" ") if w]) + return utterance + + +class CatalanNormalizer(Normalizer): + with open(f"{dirname(dirname(__file__))}/res/ca/normalize.json") as f: + _default_config = json.load(f) + + @staticmethod + def tokenize(utterance : str) -> List[str]: + return word_tokenize(utterance, lang="ca") + + +class CzechNormalizer(Normalizer): + with open(f"{dirname(dirname(__file__))}/res/cz/normalize.json", encoding='utf8') as f: + _default_config = json.load(f) + + +class PortugueseNormalizer(Normalizer): + with open(f"{dirname(dirname(__file__))}/res/pt/normalize.json") as f: + _default_config = json.load(f) + + @staticmethod + def tokenize(utterance: str) -> List[str]: + return word_tokenize(utterance, lang="pt") + + +class RussianNormalizer(Normalizer): + with open(f"{dirname(dirname(__file__))}/res/ru/normalize.json", encoding='utf8') as f: + _default_config = json.load(f) + + +class UkrainianNormalizer(Normalizer): + with open(f"{dirname(dirname(__file__))}/res/uk/normalize.json", encoding='utf8') as f: + _default_config = json.load(f) + + +class EnglishNormalizer(Normalizer): + with open(f"{dirname(dirname(__file__))}/res/en/normalize.json") as f: + 
+
+
+class CatalanNormalizer(Normalizer):
+    with open(f"{dirname(dirname(__file__))}/res/ca/normalize.json") as f:
+        _default_config = json.load(f)
+
+    @staticmethod
+    def tokenize(utterance: str) -> List[str]:
+        return word_tokenize(utterance, lang="ca")
+
+
+class CzechNormalizer(Normalizer):
+    with open(f"{dirname(dirname(__file__))}/res/cz/normalize.json", encoding='utf8') as f:
+        _default_config = json.load(f)
+
+
+class PortugueseNormalizer(Normalizer):
+    with open(f"{dirname(dirname(__file__))}/res/pt/normalize.json") as f:
+        _default_config = json.load(f)
+
+    @staticmethod
+    def tokenize(utterance: str) -> List[str]:
+        return word_tokenize(utterance, lang="pt")
+
+
+class RussianNormalizer(Normalizer):
+    with open(f"{dirname(dirname(__file__))}/res/ru/normalize.json", encoding='utf8') as f:
+        _default_config = json.load(f)
+
+
+class UkrainianNormalizer(Normalizer):
+    with open(f"{dirname(dirname(__file__))}/res/uk/normalize.json", encoding='utf8') as f:
+        _default_config = json.load(f)
+
+
+class EnglishNormalizer(Normalizer):
+    with open(f"{dirname(dirname(__file__))}/res/en/normalize.json") as f:
+        _default_config = json.load(f)
+
+    def numbers_to_digits(self, utterance: str) -> str:
+        return EnglishNumberParser().convert_words_to_numbers(utterance)
+
+
+class AzerbaijaniNormalizer(Normalizer):
+    with open(f"{dirname(dirname(__file__))}/res/az/normalize.json") as f:
+        _default_config = json.load(f)
+
+    def numbers_to_digits(self, utterance: str) -> str:
+        return AzerbaijaniNumberParser().convert_words_to_numbers(utterance)
+
+
+class GermanNormalizer(Normalizer):
+    with open(f"{dirname(dirname(__file__))}/res/de/normalize.json") as f:
+        _default_config = json.load(f)
+
+    def numbers_to_digits(self, utterance: str) -> str:
+        return GermanNumberParser().convert_words_to_numbers(utterance)
+
+    def remove_symbols(self, utterance: str) -> str:
+        # special rule for hyphenated words in German, as some STT engines
+        # falsely return them pretty regularly
+        utterance = re.sub(r'\b(\w*)-(\w*)\b', r'\1 \2', utterance)
+        return super().remove_symbols(utterance)
diff --git a/ovos_utterance_normalizer/numeric.py b/ovos_utterance_normalizer/numeric.py
new file mode 100644
index 0000000..2a40ed4
--- /dev/null
+++ b/ovos_utterance_normalizer/numeric.py
@@ -0,0 +1,2105 @@
+from collections import OrderedDict
+from typing import List
+
+from ovos_utils.json_helper import invert_dict
+from ovos_utterance_normalizer.tokenization import word_tokenize, partition_list, \
+    Token, ReplaceableNumber
+
+
+def is_numeric(word):
+    """
+    Takes in a string and tests to see if it is a number.
+    Args:
+        word (str): string to test if a number
+    Returns:
+        (bool): True if a number, else False
+
+    """
+    try:
+        float(word)
+        return True
+    except ValueError:
+        return False
+
+
+def look_for_fractions(split_list):
+    """
+    Takes a list created by splitting on '/' and determines whether it
+    represents a fraction.
+
+    Args:
+        split_list (list): list created by splitting on '/'
+    Returns:
+        (bool): False if not a fraction, otherwise True
+
+    """
+
+    if len(split_list) == 2:
+        if is_numeric(split_list[0]) and is_numeric(split_list[1]):
+            return True
+
+    return False
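Both helpers above are used throughout the parsers below; their contract in a nutshell:

    print(is_numeric("3.14"))   # True -- anything float() accepts
    print(is_numeric("3,14"))   # False
    print(look_for_fractions("2/3".split('/')))  # True
    print(look_for_fractions("x/y".split('/')))  # False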
+
+
+class GermanNumberParser:
+    # taken from lingua_franca
+    _ARTICLES_DE = {'der', 'das', 'die', 'dem', 'den'}
+
+    # _SPOKEN_NUMBER
+    _NUM_STRING_DE = {
+        0: 'null',
+        1: 'eins',
+        2: 'zwei',
+        3: 'drei',
+        4: 'vier',
+        5: 'fünf',
+        6: 'sechs',
+        7: 'sieben',
+        8: 'acht',
+        9: 'neun',
+        10: 'zehn',
+        11: 'elf',
+        12: 'zwölf',
+        13: 'dreizehn',
+        14: 'vierzehn',
+        15: 'fünfzehn',
+        16: 'sechzehn',
+        17: 'siebzehn',
+        18: 'achtzehn',
+        19: 'neunzehn',
+        20: 'zwanzig',
+        30: 'dreißig',
+        40: 'vierzig',
+        50: 'fünfzig',
+        60: 'sechzig',
+        70: 'siebzig',
+        80: 'achtzig',
+        90: 'neunzig',
+        100: 'hundert',
+        200: 'zweihundert',
+        300: 'dreihundert',
+        400: 'vierhundert',
+        500: 'fünfhundert',
+        600: 'sechshundert',
+        700: 'siebenhundert',
+        800: 'achthundert',
+        900: 'neunhundert',
+        1000: 'tausend',
+        1000000: 'million'
+    }
+
+    _STRING_NUM_DE = invert_dict(_NUM_STRING_DE)
+    _STRING_NUM_DE.update({
+        'ein': 1,
+        'eine': 1,
+        'einer': 1,
+        'einem': 1,
+        'einen': 1
+    })
+
+    _MONTHS_DE = ['januar', 'februar', 'märz', 'april', 'mai', 'juni',
+                  'juli', 'august', 'september', 'oktober', 'november',
+                  'dezember']
+
+    # German uses "long scale" https://en.wikipedia.org/wiki/Long_and_short_scales
+    # Currently, numbers are limited to 1000000000000000000000000,
+    # but _NUM_POWERS_OF_TEN can be extended to include additional number words
+
+    _NUM_POWERS_OF_TEN_DE = [
+        '', 'tausend', 'Million', 'Milliarde', 'Billion', 'Billiarde', 'Trillion',
+        'Trilliarde'
+    ]
+
+    _FRACTION_STRING_DE = {
+        2: 'halb',
+        3: 'drittel',
+        4: 'viertel',
+        5: 'fünftel',
+        6: 'sechstel',
+        7: 'siebtel',
+        8: 'achtel',
+        9: 'neuntel',
+        10: 'zehntel',
+        11: 'elftel',
+        12: 'zwölftel',
+        13: 'dreizehntel',
+        14: 'vierzehntel',
+        15: 'fünfzehntel',
+        16: 'sechzehntel',
+        17: 'siebzehntel',
+        18: 'achtzehntel',
+        19: 'neunzehntel',
+        20: 'zwanzigstel'
+    }
+
+    _STRING_FRACTION_DE = invert_dict(_FRACTION_STRING_DE)
+    _STRING_FRACTION_DE.update({
+        'halb': 2,
+        'halbe': 2,
+        'halben': 2,
+        'halbes': 2,
+        'halber': 2,
+        'halbem': 2
+    })
+
+    # Numbers below 1 million are written in one word in German, yielding very
+    # long words
+    # In some circumstances it may be better to separate individual words
+    # Set _EXTRA_SPACE_DE = " " for separating numbers below 1 million
+    # (orthographically incorrect)
+    _EXTRA_SPACE_DE = ""
+
+    _ORDINAL_BASE_DE = {
+        "1.": "erst",
+        "2.": "zweit",
+        "3.": "dritt",
+        "4.": "viert",
+        "5.": "fünft",
+        "6.": "sechst",
+        "7.": "siebt",
+        "8.": "acht",
+        "9.": "neunt",
+        "10.": "zehnt",
+        "11.": "elft",
+        "12.": "zwölft",
+        "13.": "dreizehnt",
+        "14.": "vierzehnt",
+        "15.": "fünfzehnt",
+        "16.": "sechzehnt",
+        "17.": "siebzehnt",
+        "18.": "achtzehnt",
+        "19.": "neunzehnt",
+        "20.": "zwanzigst",
+        "21.": "einundzwanzigst",
+        "22.": "zweiundzwanzigst",
+        "23.": "dreiundzwanzigst",
+        "24.": "vierundzwanzigst",
+        "25.": "fünfundzwanzigst",
+        "26.": "sechsundzwanzigst",
+        "27.": "siebenundzwanzigst",
+        "28.": "achtundzwanzigst",
+        "29.": "neunundzwanzigst",
+        "30.": "dreißigst",
+        "31.": "einunddreißigst",
+        "32.": "zweiunddreißigst",
+        "33.": "dreiunddreißigst",
+        "34.": "vierunddreißigst",
+        "35.": "fünfunddreißigst",
+        "36.": "sechsunddreißigst",
+        "37.": "siebenunddreißigst",
+        "38.": "achtunddreißigst",
+        "39.": "neununddreißigst",
+        "40.": "vierzigst",
"41.": "einundvierzigst", + "42.": "zweiundvierzigst", + "43.": "dreiundvierzigst", + "44.": "vierundvierzigst", + "45.": "fünfundvierzigst", + "46.": "sechsundvierzigst", + "47.": "siebenundvierzigst", + "48.": "achtundvierzigst", + "49.": "neunundvierzigst", + "50.": "fünfzigst", + "51.": "einundfünfzigst", + "52.": "zweiundfünfzigst", + "53.": "dreiundfünfzigst", + "60.": "sechzigst", + "70.": "siebzigst", + "80.": "achtzigst", + "90.": "neunzigst", + "100.": "einhundertst", + "1000.": "eintausendst", + "1000000.": "millionst" + } + + _LONG_SCALE_DE = OrderedDict([ + (100, 'hundert'), + (1000, 'tausend'), + (1000000, 'million'), + (1e9, "milliarde"), + (1e12, 'billion'), + (1e15, "billiarde"), + (1e18, "trillion"), + (1e21, "trilliarde"), + (1e24, "quadrillion"), + (1e27, "quadrilliarde") + ]) + + _MULTIPLIER_DE = set(_LONG_SCALE_DE.values()) + + _STRING_LONG_SCALE_DE = invert_dict(_LONG_SCALE_DE) + + # ending manipulation + for number, item in _LONG_SCALE_DE.items(): + if int(number) > 1000: + if item.endswith('e'): + name = item + 'n' + _MULTIPLIER_DE.add(name) + _STRING_LONG_SCALE_DE[name] = number + else: + name = item + 'en' + _MULTIPLIER_DE.add(name) + _STRING_LONG_SCALE_DE[name] = number + + _LONG_ORDINAL_DE = { + 1e6: "millionst", + 1e9: "milliardst", + 1e12: "billionst", + 1e15: "billiardst", + 1e18: "trillionst", + 1e21: "trilliardst", + 1e24: "quadrillionst", + 1e27: "quadrilliardst" + } + + _LONG_ORDINAL_DE.update(_ORDINAL_BASE_DE) + + # dict für erste, drittem, millionstes ... + _STRING_LONG_ORDINAL_DE = {ord+ending: num for ord, num in invert_dict(_LONG_ORDINAL_DE).items() + for ending in ("en", "em", "es", "er", "e")} + _FRACTION_MARKER_DE = set() + _NEGATIVES_DE = {"minus"} + _NUMBER_CONNECTORS_DE = {"und"} + _COMMA_DE = {"komma", "comma", "punkt"} + + + def is_ordinal_de(self, input_str): + """ + This function takes the given text and checks if it is an ordinal number. + Args: + input_str (str): the string to check if ordinal + Returns: + (bool) or (float): False if not an ordinal, otherwise the number + corresponding to the ordinal + ordinals for 1, 3, 7 and 8 are irregular + only works for ordinals corresponding to the numbers in _STRING_NUM + """ + val = self._STRING_LONG_ORDINAL_DE.get(input_str.lower(), False) + # account for numbered ordinals + if not val and input_str.endswith('.') and is_numeric(input_str[:-1]): + val = input_str + return val + + def is_fractional_de(self, input_str, short_scale=False): + """ + This function takes the given text and checks if it is a fraction. + Args: + input_str (str): the string to check if fractional + short_scale (bool): use short scale if True, long scale if False + Returns: + (bool) or (float): False if not a fraction, otherwise the fraction + """ + # account for different numerators, e.g. 
zweidrittel
+
+        input_str = input_str.lower()
+        numerator = 1
+        prev_number = 0
+        denominator = False
+        remainder = ""
+
+        # first check if is a fraction containing a char (eg "2/3")
+        _bucket = input_str.split('/')
+        if look_for_fractions(_bucket):
+            numerator = float(_bucket[0])
+            denominator = float(_bucket[1])
+
+        if not denominator:
+            for fraction in sorted(self._STRING_FRACTION_DE.keys(),
+                                   key=lambda x: len(x),
+                                   reverse=True):
+                if fraction in input_str and not denominator:
+                    denominator = self._STRING_FRACTION_DE.get(fraction)
+                    remainder = input_str.replace(fraction, "")
+                    break
+
+            if remainder:
+                if not self._STRING_NUM_DE.get(remainder, False):
+                    # account for e.g. "eineindrittel"
+                    for numstring, number in self._STRING_NUM_DE.items():
+                        if remainder.endswith(numstring):
+                            prev_number = self._STRING_NUM_DE.get(
+                                remainder.replace(numstring, "", 1), 0)
+                            numerator = number
+                            break
+                    else:
+                        return False
+                else:
+                    numerator = self._STRING_NUM_DE.get(remainder)
+
+        if denominator:
+            return prev_number + (numerator / denominator)
+        else:
+            return False
+
+    def is_number_de(self, word: str):
+        if self.is_ordinal_de(word):
+            return None
+
+        if is_numeric(word):
+            if word.isdigit():
+                return int(word)
+            else:
+                return float(word)
+        elif word in self._STRING_NUM_DE:
+            return self._STRING_NUM_DE.get(word)
+        elif word in self._STRING_LONG_SCALE_DE:
+            return self._STRING_LONG_SCALE_DE.get(word)
+
+        return None
+
+    def convert_words_to_numbers(self, utterance, short_scale=False,
+                                 ordinals=False, fractions=True):
+        """
+        Convert words in a string into their equivalent numbers.
+        Args:
+            utterance str:
+            short_scale boolean: True if short scale numbers should be used.
+            ordinals boolean: True if ordinals (e.g. first, second, third) should
+                              be parsed to their number values (1, 2, 3...)
+        Returns:
+            str
+            The original text, with numbers subbed in where appropriate.
+        """
+        tokens = [Token(word, index) for index, word in enumerate(word_tokenize(utterance))]
+        numbers_to_replace = self.extract_numbers(tokens, short_scale, ordinals, fractions)
+
+        results = []
+        for token in tokens:
+            if not numbers_to_replace or \
+                    token.index < numbers_to_replace[0].start_index:
+                results.append(token.word)
+            else:
+                if numbers_to_replace and \
+                        token.index == numbers_to_replace[0].start_index:
+                    results.append(str(numbers_to_replace[0].value))
+                if numbers_to_replace and \
+                        token.index == numbers_to_replace[0].end_index:
+                    numbers_to_replace.pop(0)
+
+        return ' '.join(results)
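An indicative sketch of the entry point above (German number words default to the long scale, hence short_scale=False; outputs depend on the tokenizer, so treat the values as approximate):

    parser = GermanNumberParser()
    print(parser.convert_words_to_numbers("drei Minuten"))
    # -> "3 Minuten"
    print(parser.convert_words_to_numbers("eine halbe Stunde"))
    # -> "0.5 Stunde" -- fractions are resolved to floats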
+
+    def extract_numbers(self, tokens: list,
+                        short_scale: bool = False,
+                        ordinals: bool = False,
+                        fractions: bool = True) -> List:
+        """
+        Extract numeric values from a list of tokens.
+        Args:
+            tokens (list): list of tokens (str)
+            short_scale boolean: True if short scale numbers should be used.
+            ordinals boolean: True if ordinals (e.g. first, second, third) should
+                              be parsed to their number values (1, 2, 3...)
+        Returns:
+            list of extracted numbers (ReplaceableNumber)
+
+        """
+        if not isinstance(tokens[0], Token):  # list of string tokens
+            tokens = [Token(word, index) for index, word in enumerate(tokens)]
+        numbers_to_replace = self._extract_numbers_with_text_de(tokens, short_scale, ordinals, fractions)
+        numbers_to_replace.sort(key=lambda number: number.start_index)
+        return numbers_to_replace
+
+    def _extract_numbers_with_text_de(self, tokens, short_scale=True,
+                                      ordinals=False, fractions=True):
+        """
+        Extract all numbers from a list of Tokens, with the words that
+        represent them.
+
+        Args:
+            [Token]: The tokens to parse.
+            short_scale bool: True if short scale numbers should be used, False for
+                              long scale. True by default.
+            ordinals bool: True if ordinal words (first, second, third, etc) should
+                           be parsed.
+            fractions bool: True if we should look for fractions and decimals.
+
+        Returns:
+            [ReplaceableNumber]: A list of tuples, each containing a number and a
+                                 string.
+
+        """
+        placeholder = ""  # inserted to maintain correct indices
+        results = []
+        while True:
+            to_replace = \
+                self._extract_number_with_text_de(tokens, short_scale,
+                                                  ordinals)
+
+            if not to_replace:
+                break
+
+            if isinstance(to_replace.value, float) and not fractions:
+                pass
+            else:
+                results.append(to_replace)
+
+            tokens = [
+                t if not
+                to_replace.start_index <= t.index <= to_replace.end_index
+                else
+                Token(placeholder, t.index) for t in tokens
+            ]
+        results.sort(key=lambda n: n.start_index)
+        return results
+
+    def _extract_number_with_text_de(self, tokens, short_scale=True,
+                                     ordinals=False):
+        """
+        This function extracts a number from a list of Tokens.
+
+        Args:
+            tokens [Token]: the tokens to parse
+            short_scale (bool): use short scale if True, long scale if False
+            ordinals (bool): consider ordinal numbers
+        Returns:
+            ReplaceableNumber
+
+        """
+        number, tokens = \
+            self._extract_number_with_text_de_helper(tokens, short_scale,
+                                                     ordinals)
+        return ReplaceableNumber(number, tokens)
+
+    def _extract_number_with_text_de_helper(self, tokens,
+                                            short_scale, ordinals):
+        """
+        Helper for _extract_number_with_text_de.
+
+        Args:
+            tokens [Token]:
+            short_scale boolean:
+            ordinals boolean:
+        Returns:
+            int or float, [Tokens]
+        """
+        if ordinals:
+            for token in tokens:
+                ordinal = self.is_ordinal_de(token.word)
+                if ordinal:
+                    return ordinal, [token]
+
+        return self._extract_real_number_with_text_de(tokens, short_scale)
+
+    def _extract_real_number_with_text_de(self, tokens, short_scale):
+        """
+        This is handling real numbers.
+
+        Args:
+            tokens [Token]:
+            short_scale boolean:
+        Returns:
+            int or float, [Tokens]
+            The value parsed, and tokens that it corresponds to.
+ """ + number_words = [] + val = _val = _current_val = None + _comma = False + to_sum = [] + + for idx, token in enumerate(tokens): + + _prev_val = _current_val + _current_val = None + + word = token.word + + if word in self._NUMBER_CONNECTORS_DE and not number_words: + continue + if word in (self._NEGATIVES_DE | + self._NUMBER_CONNECTORS_DE | + self._COMMA_DE): + number_words.append(token) + if word in self._COMMA_DE: + _comma = token + _current_val = _val or _prev_val + continue + + prev_word = tokens[idx - 1].word if idx > 0 else "" + next_word = tokens[idx + 1].word if idx + 1 < len(tokens) else "" + + if word not in self._STRING_LONG_SCALE_DE and \ + word not in self._STRING_NUM_DE and \ + word not in self._MULTIPLIER_DE and \ + not is_numeric(word) and \ + not self.is_fractional_de(word): + words_only = [token.word for token in number_words] + if _val is not None: + to_sum.append(_val) + if to_sum: + val = sum(to_sum) + + if number_words and (not all([w in self._ARTICLES_DE | + self._NEGATIVES_DE| + self._NUMBER_CONNECTORS_DE + for w in words_only]) + or str(val) == number_words[-1].word): + break + else: + number_words.clear() + to_sum.clear() + val = _val = _prev_val = None + continue + elif word not in self._MULTIPLIER_DE \ + and prev_word not in self._MULTIPLIER_DE \ + and prev_word not in self._NUMBER_CONNECTORS_DE \ + and prev_word not in self._NEGATIVES_DE \ + and prev_word not in self._COMMA_DE \ + and prev_word not in self._STRING_LONG_SCALE_DE \ + and prev_word not in self._STRING_NUM_DE \ + and not self.is_ordinal_de(word) \ + and not is_numeric(prev_word) \ + and not self.is_fractional_de(prev_word): + number_words = [token] + else: + number_words.append(token) + + # is this word already a number or a word of a number? + _val = _current_val = self.is_number_de(word) + + # is this a negative number? + if _current_val is not None and prev_word in self._NEGATIVES_DE: + _val = 0 - _current_val + + # is the prev word a number and should we multiply it? + if _prev_val is not None and ( word in self._MULTIPLIER_DE or \ + word in ("einer", "eines", "einem")): + to_sum.append(_prev_val * _current_val or _current_val) + _val = _current_val = None + + # fraction handling + _fraction_val = self.is_fractional_de(word, short_scale=short_scale) + if _fraction_val: + if _prev_val is not None and prev_word != "eine" and \ + word not in self._STRING_FRACTION_DE: # zusammengesetzter Bruch + _val = _prev_val + _fraction_val + if prev_word not in self._NUMBER_CONNECTORS_DE \ + and tokens[idx -1] not in number_words: + number_words.append(tokens[idx - 1]) + elif _prev_val is not None: + _val = _prev_val * _fraction_val + if tokens[idx -1] not in number_words: + number_words.append(tokens[idx - 1]) + else: + _val = _fraction_val + _current_val = _val + + # directly following numbers without relation + if (is_numeric(prev_word) or prev_word in self._STRING_NUM_DE) \ + and not _fraction_val \ + and not self.is_fractional_de(next_word) \ + and not to_sum: + val = _prev_val + number_words.pop(-1) + break + + # is this a spoken time ("drei viertel acht") + if isinstance(_prev_val, float) and self.is_number_de(word) and not to_sum: + if idx+1 < len(tokens): + _, number = self._extract_real_number_with_text_de([tokens[idx + 1]], + short_scale=short_scale) + if not next_word or not number: + val = f"{_val-1}:{int(60*_prev_val)}" + break + + # spoken decimals + if _current_val is not None and _comma: + # to_sum = [ 1, 0.2, 0.04,...] 
+ to_sum.append(_current_val if _current_val >= 10 else ( + _current_val) / (10 ** (token.index - _comma.index))) + _val = _current_val = None + + if _current_val is not None and \ + next_word in (self._NUMBER_CONNECTORS_DE | self._COMMA_DE | {""}): + to_sum.append(_val or _current_val) + _val = _current_val = None + + if not next_word and number_words: + val = sum(to_sum) or _val + + return val, number_words + + +# TODO - finish adding public user facing methods +class EnglishNumberParser: + # taken from lingua_franca + + # TODO - from json file + _ARTICLES_EN = {'a', 'an', 'the'} + _NUM_STRING_EN = { + 0: 'zero', + 1: 'one', + 2: 'two', + 3: 'three', + 4: 'four', + 5: 'five', + 6: 'six', + 7: 'seven', + 8: 'eight', + 9: 'nine', + 10: 'ten', + 11: 'eleven', + 12: 'twelve', + 13: 'thirteen', + 14: 'fourteen', + 15: 'fifteen', + 16: 'sixteen', + 17: 'seventeen', + 18: 'eighteen', + 19: 'nineteen', + 20: 'twenty', + 30: 'thirty', + 40: 'forty', + 50: 'fifty', + 60: 'sixty', + 70: 'seventy', + 80: 'eighty', + 90: 'ninety' + } + _FRACTION_STRING_EN = { + 2: 'half', + 3: 'third', + 4: 'forth', + 5: 'fifth', + 6: 'sixth', + 7: 'seventh', + 8: 'eigth', + 9: 'ninth', + 10: 'tenth', + 11: 'eleventh', + 12: 'twelveth', + 13: 'thirteenth', + 14: 'fourteenth', + 15: 'fifteenth', + 16: 'sixteenth', + 17: 'seventeenth', + 18: 'eighteenth', + 19: 'nineteenth', + 20: 'twentyith' + } + _LONG_SCALE_EN = OrderedDict([ + (100, 'hundred'), + (1000, 'thousand'), + (1000000, 'million'), + (1e12, "billion"), + (1e18, 'trillion'), + (1e24, "quadrillion"), + (1e30, "quintillion"), + (1e36, "sextillion"), + (1e42, "septillion"), + (1e48, "octillion"), + (1e54, "nonillion"), + (1e60, "decillion"), + (1e66, "undecillion"), + (1e72, "duodecillion"), + (1e78, "tredecillion"), + (1e84, "quattuordecillion"), + (1e90, "quinquadecillion"), + (1e96, "sedecillion"), + (1e102, "septendecillion"), + (1e108, "octodecillion"), + (1e114, "novendecillion"), + (1e120, "vigintillion"), + (1e306, "unquinquagintillion"), + (1e312, "duoquinquagintillion"), + (1e336, "sesquinquagintillion"), + (1e366, "unsexagintillion") + ]) + _SHORT_SCALE_EN = OrderedDict([ + (100, 'hundred'), + (1000, 'thousand'), + (1000000, 'million'), + (1e9, "billion"), + (1e12, 'trillion'), + (1e15, "quadrillion"), + (1e18, "quintillion"), + (1e21, "sextillion"), + (1e24, "septillion"), + (1e27, "octillion"), + (1e30, "nonillion"), + (1e33, "decillion"), + (1e36, "undecillion"), + (1e39, "duodecillion"), + (1e42, "tredecillion"), + (1e45, "quattuordecillion"), + (1e48, "quinquadecillion"), + (1e51, "sedecillion"), + (1e54, "septendecillion"), + (1e57, "octodecillion"), + (1e60, "novendecillion"), + (1e63, "vigintillion"), + (1e66, "unvigintillion"), + (1e69, "uuovigintillion"), + (1e72, "tresvigintillion"), + (1e75, "quattuorvigintillion"), + (1e78, "quinquavigintillion"), + (1e81, "qesvigintillion"), + (1e84, "septemvigintillion"), + (1e87, "octovigintillion"), + (1e90, "novemvigintillion"), + (1e93, "trigintillion"), + (1e96, "untrigintillion"), + (1e99, "duotrigintillion"), + (1e102, "trestrigintillion"), + (1e105, "quattuortrigintillion"), + (1e108, "quinquatrigintillion"), + (1e111, "sestrigintillion"), + (1e114, "septentrigintillion"), + (1e117, "octotrigintillion"), + (1e120, "noventrigintillion"), + (1e123, "quadragintillion"), + (1e153, "quinquagintillion"), + (1e183, "sexagintillion"), + (1e213, "septuagintillion"), + (1e243, "octogintillion"), + (1e273, "nonagintillion"), + (1e303, "centillion"), + (1e306, "uncentillion"), + (1e309, "duocentillion"), + 
(1e312, "trescentillion"), + (1e333, "decicentillion"), + (1e336, "undecicentillion"), + (1e363, "viginticentillion"), + (1e366, "unviginticentillion"), + (1e393, "trigintacentillion"), + (1e423, "quadragintacentillion"), + (1e453, "quinquagintacentillion"), + (1e483, "sexagintacentillion"), + (1e513, "septuagintacentillion"), + (1e543, "ctogintacentillion"), + (1e573, "nonagintacentillion"), + (1e603, "ducentillion"), + (1e903, "trecentillion"), + (1e1203, "quadringentillion"), + (1e1503, "quingentillion"), + (1e1803, "sescentillion"), + (1e2103, "septingentillion"), + (1e2403, "octingentillion"), + (1e2703, "nongentillion"), + (1e3003, "millinillion") + ]) + _ORDINAL_BASE_EN = { + 1: 'first', + 2: 'second', + 3: 'third', + 4: 'fourth', + 5: 'fifth', + 6: 'sixth', + 7: 'seventh', + 8: 'eighth', + 9: 'ninth', + 10: 'tenth', + 11: 'eleventh', + 12: 'twelfth', + 13: 'thirteenth', + 14: 'fourteenth', + 15: 'fifteenth', + 16: 'sixteenth', + 17: 'seventeenth', + 18: 'eighteenth', + 19: 'nineteenth', + 20: 'twentieth', + 30: 'thirtieth', + 40: "fortieth", + 50: "fiftieth", + 60: "sixtieth", + 70: "seventieth", + 80: "eightieth", + 90: "ninetieth", + 1e2: "hundredth", + 1e3: "thousandth" + } + _SHORT_ORDINAL_EN = { + 1e6: "millionth", + 1e9: "billionth", + 1e12: "trillionth", + 1e15: "quadrillionth", + 1e18: "quintillionth", + 1e21: "sextillionth", + 1e24: "septillionth", + 1e27: "octillionth", + 1e30: "nonillionth", + 1e33: "decillionth" + # TODO > 1e-33 + } + _SHORT_ORDINAL_EN.update(_ORDINAL_BASE_EN) + _LONG_ORDINAL_EN = { + 1e6: "millionth", + 1e12: "billionth", + 1e18: "trillionth", + 1e24: "quadrillionth", + 1e30: "quintillionth", + 1e36: "sextillionth", + 1e42: "septillionth", + 1e48: "octillionth", + 1e54: "nonillionth", + 1e60: "decillionth" + # TODO > 1e60 + } + _LONG_ORDINAL_EN.update(_ORDINAL_BASE_EN) + # negate next number (-2 = 0 - 2) + _NEGATIVES_EN = {"negative", "minus"} + # sum the next number (twenty two = 20 + 2) + _SUMS_EN = {'twenty', '20', 'thirty', '30', 'forty', '40', 'fifty', '50', + 'sixty', '60', 'seventy', '70', 'eighty', '80', 'ninety', '90'} + _MULTIPLIES_LONG_SCALE_EN = set(_LONG_SCALE_EN.values()) | \ + {value + "s" for value in _LONG_SCALE_EN.values()} + _MULTIPLIES_SHORT_SCALE_EN = set(_SHORT_SCALE_EN.values()) | \ + {value + "s" for value in _SHORT_SCALE_EN.values()} + # split sentence parse separately and sum ( 2 and a half = 2 + 0.5 ) + _FRACTION_MARKER_EN = {"and"} + # decimal marker ( 1 point 5 = 1 + 0.5) + _DECIMAL_MARKER_EN = {"point", "dot"} + _STRING_NUM_EN = {v: k for k, v in _NUM_STRING_EN.items()} + _STRING_NUM_EN.update({key + 's': value for key, value in _STRING_NUM_EN.items()}) + _SPOKEN_EXTRA_NUM_EN = { + "half": 0.5, + "halves": 0.5, + "couple": 2 + } + _STRING_SHORT_ORDINAL_EN = {v: k for k, v in _SHORT_ORDINAL_EN.items()} + _STRING_LONG_ORDINAL_EN = {v: k for k, v in _LONG_ORDINAL_EN.items()} + + def is_fractional(self, input_str, short_scale=True, spoken=True): + """ + This function takes the given text and checks if it is a fraction. + + Args: + input_str (str): the string to check if fractional + short_scale (bool): use short scale if True, long scale if False + spoken (bool): consider "half", "quarter", "whole" a fraction + Returns: + (bool) or (float): False if not a fraction, otherwise the fraction + + """ + if input_str.endswith('s', -1): + input_str = input_str[:len(input_str) - 1] # e.g. 
"fifths" + + fracts = {"whole": 1, "half": 2, "halve": 2, "quarter": 4} + if short_scale: + for num in self._SHORT_ORDINAL_EN: + if num > 2: + fracts[self._SHORT_ORDINAL_EN[num]] = num + else: + for num in self._LONG_ORDINAL_EN: + if num > 2: + fracts[self._LONG_ORDINAL_EN[num]] = num + + if input_str.lower() in fracts and spoken: + return 1.0 / fracts[input_str.lower()] + return False + + def convert_words_to_numbers(self, utterance, short_scale=True, ordinals=False): + """ + Convert words in a string into their equivalent numbers. + Args: + text str: + short_scale boolean: True if short scale numbers should be used. + ordinals boolean: True if ordinals (e.g. first, second, third) should + be parsed to their number values (1, 2, 3...) + + Returns: + str + The original text, with numbers subbed in where appropriate. + + """ + tokens = [Token(word, index) for index, word in enumerate(word_tokenize(utterance))] + numbers_to_replace = self.extract_numbers(tokens, short_scale, ordinals) + + results = [] + for token in tokens: + if not numbers_to_replace or \ + token.index < numbers_to_replace[0].start_index: + results.append(token.word) + else: + if numbers_to_replace and \ + token.index == numbers_to_replace[0].start_index: + results.append(str(numbers_to_replace[0].value)) + if numbers_to_replace and \ + token.index == numbers_to_replace[0].end_index: + numbers_to_replace.pop(0) + + return ' '.join(results) + + def extract_numbers(self, tokens: list, short_scale: bool=True, ordinals: bool=False) -> List: + """ + extract numeric values from a list of tokens. + Args: + tokens (list): list of tokens (str) + short_scale boolean: True if short scale numbers should be used. + ordinals boolean: True if ordinals (e.g. first, second, third) should + be parsed to their number values (1, 2, 3...) + Returns: + list of extraced numbers (ReplaceableNumber) + + """ + if not isinstance(tokens[0], Token): # list of string tokens + tokens = [Token(word, index) for index, word in enumerate(tokens)] + numbers_to_replace = self._extract_numbers_with_text_en(tokens, short_scale, ordinals) + numbers_to_replace.sort(key=lambda number: number.start_index) + return numbers_to_replace + + # helper methods + def _initialize_number_data_en(self, short_scale, speech=True): + """ + Generate dictionaries of words to numbers, based on scale. + + This is a helper function for _extract_whole_number. + + Args: + short_scale (bool): + speech (bool): consider extra words (_SPOKEN_EXTRA_NUM_EN) to be numbers + + Returns: + (set(str), dict(str, number), dict(str, number)) + multiplies, string_num_ordinal, string_num_scale + + """ + multiplies = self._MULTIPLIES_SHORT_SCALE_EN if short_scale \ + else self._MULTIPLIES_LONG_SCALE_EN + + string_num_ordinal_en = self._STRING_SHORT_ORDINAL_EN if short_scale \ + else self._STRING_LONG_ORDINAL_EN + + string_num_scale_en = self._SHORT_SCALE_EN if short_scale else self._LONG_SCALE_EN + string_num_scale_en = {v: k for k, v in string_num_scale_en.items()} + string_num_scale_en.update({key + 's': value for key, value in string_num_scale_en.items()}) + + if speech: + string_num_scale_en.update(self._SPOKEN_EXTRA_NUM_EN) + return multiplies, string_num_ordinal_en, string_num_scale_en + + def _extract_fraction_with_text_en(self, tokens, short_scale, ordinals): + """ + Extract fraction numbers from a string. + + This function handles text such as '2 and 3/4'. Note that "one half" or + similar will be parsed by the whole number function. 
+ + Args: + tokens [Token]: words and their indexes in the original string. + short_scale boolean: + ordinals boolean: + + Returns: + (int or float, [Token]) + The value found, and the list of relevant tokens. + (None, None) if no fraction value is found. + + """ + for c in self._FRACTION_MARKER_EN: + partitions = partition_list(tokens, lambda t: t.word == c) + + if len(partitions) == 3: + numbers1 = \ + self._extract_numbers_with_text_en(partitions[0], short_scale, + ordinals, fractional_numbers=False) + numbers2 = \ + self._extract_numbers_with_text_en(partitions[2], short_scale, + ordinals, fractional_numbers=True) + + if not numbers1 or not numbers2: + return None, None + + # ensure first is not a fraction and second is a fraction + num1 = numbers1[-1] + num2 = numbers2[0] + if num1.value >= 1 and 0 < num2.value < 1: + return num1.value + num2.value, \ + num1.tokens + partitions[1] + num2.tokens + + return None, None + + def _extract_decimal_with_text_en(self, tokens, short_scale, ordinals): + """ + Extract decimal numbers from a string. + + This function handles text such as '2 point 5'. + + Notes: + While this is a helper for extractnumber_en, it also depends on + extractnumber_en, to parse out the components of the decimal. + + This does not currently handle things like: + number dot number number number + + Args: + tokens [Token]: The text to parse. + short_scale boolean: + ordinals boolean: + + Returns: + (float, [Token]) + The value found and relevant tokens. + (None, None) if no decimal value is found. + + """ + for c in self._DECIMAL_MARKER_EN: + partitions = partition_list(tokens, lambda t: t.word == c) + + if len(partitions) == 3: + numbers1 = \ + self._extract_numbers_with_text_en(partitions[0], short_scale, + ordinals, fractional_numbers=False) + numbers2 = \ + self._extract_numbers_with_text_en(partitions[2], short_scale, + ordinals, fractional_numbers=False) + + if not numbers1 or not numbers2: + return None, None + + number = numbers1[-1] + decimal = numbers2[0] + + # TODO handle number dot number number number + if "." not in str(decimal.text): + return number.value + float('0.' + str(decimal.value)), \ + number.tokens + partitions[1] + decimal.tokens + return None, None + + def _extract_whole_number_with_text_en(self, tokens, short_scale, ordinals): + """ + Handle numbers not handled by the decimal or fraction functions. This is + generally whole numbers. Note that phrases such as "one half" will be + handled by this function, while "one and a half" are handled by the + fraction function. + + Args: + tokens [Token]: + short_scale boolean: + ordinals boolean: + + Returns: + int or float, [Tokens] + The value parsed, and tokens that it corresponds to. + + """ + multiplies, string_num_ordinal, string_num_scale = \ + self._initialize_number_data_en(short_scale, speech=ordinals is not None) + + number_words = [] # type: List[Token] + val = False + prev_val = None + next_val = None + to_sum = [] + for idx, token in enumerate(tokens): + current_val = None + if next_val: + next_val = None + continue + + word = token.word.lower() + if word in self._ARTICLES_EN or word in self._NEGATIVES_EN: + number_words.append(token) + continue + + prev_word = tokens[idx - 1].word.lower() if idx > 0 else "" + next_word = tokens[idx + 1].word.lower() if idx + 1 < len(tokens) else "" + + if is_numeric(word[:-2]) and \ + (word.endswith("st") or word.endswith("nd") or + word.endswith("rd") or word.endswith("th")): + + # explicit ordinals, 1st, 2nd, 3rd, 4th.... 
Nth + word = word[:-2] + + # handle nth one + if next_word == "one": + # would return 1 instead otherwise + tokens[idx + 1] = Token("", idx) + next_word = "" + + # TODO replaces the wall of "and" and "or" with all() or any() as + # appropriate, the whole codebase should be checked for this pattern + if word not in string_num_scale and \ + word not in self._STRING_NUM_EN and \ + word not in self._SUMS_EN and \ + word not in multiplies and \ + not (ordinals and word in string_num_ordinal) and \ + not is_numeric(word) and \ + not self.is_fractional(word, short_scale=short_scale) and \ + not look_for_fractions(word.split('/')): + words_only = [token.word for token in number_words] + + if number_words and not all([w.lower() in self._ARTICLES_EN | + self._NEGATIVES_EN for w in words_only]): + break + else: + number_words = [] + continue + elif word not in multiplies \ + and prev_word not in multiplies \ + and prev_word not in self._SUMS_EN \ + and not (ordinals and prev_word in string_num_ordinal) \ + and prev_word not in self._NEGATIVES_EN \ + and prev_word not in self._ARTICLES_EN: + number_words = [token] + + elif prev_word in self._SUMS_EN and word in self._SUMS_EN: + number_words = [token] + elif ordinals is None and \ + (word in string_num_ordinal or word in self._SPOKEN_EXTRA_NUM_EN): + # flagged to ignore this token + continue + else: + number_words.append(token) + + # is this word already a number ? + if is_numeric(word): + if word.isdigit(): # doesn't work with decimals + val = int(word) + else: + val = float(word) + current_val = val + + # is this word the name of a number ? + if word in self._STRING_NUM_EN: + val = self._STRING_NUM_EN.get(word) + current_val = val + elif word in string_num_scale: + val = string_num_scale.get(word) + current_val = val + elif ordinals and word in string_num_ordinal: + val = string_num_ordinal[word] + current_val = val + + # is the prev word an ordinal number and current word is one? + # second one, third one + if ordinals and prev_word in string_num_ordinal and val == 1: + val = prev_val + + # is the prev word a number and should we sum it? + # twenty two, fifty six + if (prev_word in self._SUMS_EN and val and val < 10) or all([prev_word in + multiplies, + val < prev_val if prev_val else False]): + val = prev_val + val + + # is the prev word a number and should we multiply it? + # twenty hundred, six hundred + if word in multiplies: + if not prev_val: + prev_val = 1 + val = prev_val * val + + # is this a spoken fraction? + # half cup + if val is False and \ + not (ordinals is None and word in string_num_ordinal): + val = self.is_fractional(word, short_scale=short_scale, + spoken=ordinals is not None) + + current_val = val + + # 2 fifths + if ordinals is False: + next_val = self.is_fractional(next_word, short_scale=short_scale) + if next_val: + if not val: + val = 1 + val = val * next_val + number_words.append(tokens[idx + 1]) + + # is this a negative number? + if val and prev_word and prev_word in self._NEGATIVES_EN: + val = 0 - val + + # let's make sure it isn't a fraction + if not val: + # look for fractions like "2/3" + aPieces = word.split('/') + if look_for_fractions(aPieces): + val = float(aPieces[0]) / float(aPieces[1]) + current_val = val + + else: + if current_val and all([ + prev_word in self._SUMS_EN, + word not in self._SUMS_EN, + word not in multiplies, + current_val >= 10]): + # Backtrack - we've got numbers we can't sum. 
+ number_words.pop() + val = prev_val + break + prev_val = val + + if word in multiplies and next_word not in multiplies: + # handle long numbers + # six hundred sixty six + # two million five hundred thousand + # + # This logic is somewhat complex, and warrants + # extensive documentation for the next coder's sake. + # + # The current word is a power of ten. `current_val` is + # its integer value. `val` is our working sum + # (above, when `current_val` is 1 million, `val` is + # 2 million.) + # + # We have a dict `string_num_scale` containing [value, word] + # pairs for "all" powers of ten: string_num_scale[10] == "ten. + # + # We need go over the rest of the tokens, looking for other + # powers of ten. If we find one, we compare it with the current + # value, to see if it's smaller than the current power of ten. + # + # Numbers which are not powers of ten will be passed over. + # + # If all the remaining powers of ten are smaller than our + # current value, we can set the current value aside for later, + # and begin extracting another portion of our final result. + # For example, suppose we have the following string. + # The current word is "million".`val` is 9000000. + # `current_val` is 1000000. + # + # "nine **million** nine *hundred* seven **thousand** + # six *hundred* fifty seven" + # + # Iterating over the rest of the string, the current + # value is larger than all remaining powers of ten. + # + # The if statement passes, and nine million (9000000) + # is appended to `to_sum`. + # + # The main variables are reset, and the main loop begins + # assembling another number, which will also be appended + # under the same conditions. + # + # By the end of the main loop, to_sum will be a list of each + # "place" from 100 up: [9000000, 907000, 600] + # + # The final three digits will be added to the sum of that list + # at the end of the main loop, to produce the extracted number: + # + # sum([9000000, 907000, 600]) + 57 + # == 9,000,000 + 907,000 + 600 + 57 + # == 9,907,657 + # + # >>> foo = "nine million nine hundred seven thousand six + # hundred fifty seven" + # >>> extract_number(foo) + # 9907657 + + time_to_sum = True + for other_token in tokens[idx + 1:]: + if other_token.word.lower() in multiplies: + if string_num_scale[other_token.word.lower()] >= current_val: + time_to_sum = False + else: + continue + if not time_to_sum: + break + if time_to_sum: + to_sum.append(val) + val = 0 + prev_val = 0 + + if val is not None and to_sum: + val += sum(to_sum) + + return val, number_words + + def _extract_number_with_text_en_helper(self, tokens, + short_scale=True, ordinals=False, + fractional_numbers=True): + """ + Helper for _extract_number_with_text_en. + + This contains the real logic for parsing, but produces + a result that needs a little cleaning (specific, it may + contain leading articles that can be trimmed off). 
+ + Args: + tokens [Token]: + short_scale boolean: + ordinals boolean: + fractional_numbers boolean: + + Returns: + int or float, [Tokens] + + """ + if fractional_numbers: + fraction, fraction_text = \ + self._extract_fraction_with_text_en(tokens, short_scale, ordinals) + if fraction: + return fraction, fraction_text + + decimal, decimal_text = \ + self._extract_decimal_with_text_en(tokens, short_scale, ordinals) + if decimal: + return decimal, decimal_text + + return self._extract_whole_number_with_text_en(tokens, short_scale, ordinals) + + def _extract_number_with_text_en(self, tokens, short_scale=True, + ordinals=False, fractional_numbers=True): + """ + This function extracts a number from a list of Tokens. + + Args: + tokens str: the string to normalize + short_scale (bool): use short scale if True, long scale if False + ordinals (bool): consider ordinal numbers, third=3 instead of 1/3 + fractional_numbers (bool): True if we should look for fractions and + decimals. + Returns: + ReplaceableNumber + + """ + number, tokens = \ + self._extract_number_with_text_en_helper(tokens, short_scale, + ordinals, fractional_numbers) + while tokens and tokens[0].word in self._ARTICLES_EN: + tokens.pop(0) + return ReplaceableNumber(number, tokens) + + def _extract_numbers_with_text_en(self, tokens, short_scale=True, + ordinals=False, fractional_numbers=True): + """ + Extract all numbers from a list of Tokens, with the words that + represent them. + + Args: + [Token]: The tokens to parse. + short_scale bool: True if short scale numbers should be used, False for + long scale. True by default. + ordinals bool: True if ordinal words (first, second, third, etc) should + be parsed. + fractional_numbers bool: True if we should look for fractions and + decimals. + + Returns: + [ReplaceableNumber]: A list of tuples, each containing a number and a + string. 
+ + """ + placeholder = "" # inserted to maintain correct indices + results = [] + while True: + to_replace = \ + self._extract_number_with_text_en(tokens, short_scale, + ordinals, fractional_numbers) + + if not to_replace: + break + + results.append(to_replace) + + tokens = [ + t if not + to_replace.start_index <= t.index <= to_replace.end_index + else + Token(placeholder, t.index) for t in tokens + ] + results.sort(key=lambda n: n.start_index) + return results + + +class AzerbaijaniNumberParser: + # taken from lingua_franca + + # TODO - from json file + _NUM_STRING_AZ = { + 0: 'sıfır', + 1: 'bir', + 2: 'iki', + 3: 'üç', + 4: 'dörd', + 5: 'beş', + 6: 'altı', + 7: 'yeddi', + 8: 'səkkiz', + 9: 'doqquz', + 10: 'on', + 11: 'on bir', + 12: 'on iki', + 13: 'on üç', + 14: 'on dörd', + 15: 'on beş', + 16: 'on altı', + 17: 'on yeddi', + 18: 'on səkkiz', + 19: 'on doqquz', + 20: 'iyirmi', + 30: 'otuz', + 40: 'qırx', + 50: 'əlli', + 60: 'altmış', + 70: 'yetmiş', + 80: 'səksən', + 90: 'doxsan' + } + _FRACTION_STRING_AZ = { + 2: 'ikidə', + 3: 'üçdə', + 4: 'dörddə', + 5: 'beşdə', + 6: 'altıda', + 7: 'yeddidə', + 8: 'səkkizdə', + 9: 'doqquzda', + 10: 'onda', + 11: 'on birdə', + 12: 'on ikidə', + 13: 'on üçdə', + 14: 'on dörddə', + 15: 'on beşdə', + 16: 'on altıda', + 17: 'on yeddidə', + 18: 'on səkkizdə', + 19: 'on doqquzda', + 20: 'iyirmidə', + 30: 'otuzda', + 40: 'qırxda', + 50: 'əllidə', + 60: 'altmışda', + 70: 'yetmişdə', + 80: 'səksəndə', + 90: 'doxsanda', + 1e2: 'yüzdə', + 1e3: 'mində' + } + _LONG_SCALE_AZ = OrderedDict([ + (100, 'yüz'), + (1000, 'min'), + (1000000, 'milyon'), + (1e12, "milyard"), + (1e18, 'trilyon'), + (1e24, "kvadrilyon"), + (1e30, "kvintilyon"), + (1e36, "sekstilyon"), + (1e42, "septilyon"), + (1e48, "oktilyon"), + (1e54, "nonilyon"), + (1e60, "dekilyon") + ]) + _SHORT_SCALE_AZ = OrderedDict([ + (100, 'yüz'), + (1000, 'min'), + (1000000, 'milyon'), + (1e9, "milyard"), + (1e12, 'trilyon'), + (1e15, "kvadrilyon"), + (1e18, "kvintilyon"), + (1e21, "sekstilyon"), + (1e24, "septilyon"), + (1e27, "oktilyon"), + (1e30, "nonilyon"), + (1e33, "dekilyon") + ]) + _ORDINAL_BASE_AZ = { + 1: 'birinci', + 2: 'ikinci', + 3: 'üçüncü', + 4: 'dördüncü', + 5: 'beşinci', + 6: 'altıncı', + 7: 'yeddinci', + 8: 'səkkizinci', + 9: 'doqquzuncu', + 10: 'onuncu', + 11: 'on birinci', + 12: 'on ikinci', + 13: 'on üçüncü', + 14: 'on dördüncü', + 15: 'on beşinci', + 16: 'on altıncı', + 17: 'on yeddinci', + 18: 'on səkkizinci', + 19: 'on doqquzuncu', + 20: 'iyirminci', + 30: 'otuzuncu', + 40: "qırxıncı", + 50: "əllinci", + 60: "altmışıncı", + 70: "yetmışinci", + 80: "səksəninci", + 90: "doxsanınçı", + 1e2: "yüzüncü", + 1e3: "mininci" + } + _SHORT_ORDINAL_AZ = { + 1e6: "milyonuncu", + 1e9: "milyardıncı", + 1e12: "trilyonuncu", + 1e15: "kvadrilyonuncu", + 1e18: "kvintilyonuncu", + 1e21: "sekstilyonuncu", + 1e24: "septilyonuncu", + 1e27: "oktilyonuncu", + 1e30: "nonilyonuncu", + 1e33: "dekilyonuncu" + # TODO > 1e-33 + } + _SHORT_ORDINAL_AZ.update(_ORDINAL_BASE_AZ) + _LONG_ORDINAL_AZ = { + 1e6: "milyonuncu", + 1e12: "milyardıncı", + 1e18: "trilyonuncu", + 1e24: "kvadrilyonuncu", + 1e30: "kvintilyonuncu", + 1e36: "sekstilyonuncu", + 1e42: "septilyonuncu", + 1e48: "oktilyonuncu", + 1e54: "nonilyonuncu", + 1e60: "dekilyonuncu" + # TODO > 1e60 + } + _LONG_ORDINAL_AZ.update(_ORDINAL_BASE_AZ) + # negate next number (-2 = 0 - 2) + _NEGATIVES_AZ = {"mənfi", "minus"} + # sum the next number (iyirmi iki = 20 + 2) + _SUMS_AZ = {'on', '10', 'iyirmi', '20', 'otuz', '30', 'qırx', '40', 'əlli', '50', + 'altmış', '60', 'yetmiş', 
'70', 'səksən', '80', 'doxsan', '90'}
+    _MULTIPLIES_LONG_SCALE_AZ = set(_LONG_SCALE_AZ.values())
+    _MULTIPLIES_SHORT_SCALE_AZ = set(_SHORT_SCALE_AZ.values())
+    # split sentence parse separately and sum ( 2 and a half = 2 + 0.5 )
+    _FRACTION_MARKER_AZ = {"və"}
+    # decimal marker ( 1 nöqtə 5 = 1 + 0.5)
+    _DECIMAL_MARKER_AZ = {"nöqtə"}
+    _STRING_NUM_AZ = {v: k for k, v in _NUM_STRING_AZ.items()}
+    _SPOKEN_EXTRA_NUM_AZ = {
+        "yarım": 0.5,
+        "üçdəbir": 1 / 3,
+        "dörddəbir": 1 / 4
+    }
+    _STRING_SHORT_ORDINAL_AZ = {v: k for k, v in _SHORT_ORDINAL_AZ.items()}
+    _STRING_LONG_ORDINAL_AZ = {v: k for k, v in _LONG_ORDINAL_AZ.items()}
+
+    def convert_words_to_numbers(self, text, short_scale=True, ordinals=False):
+        """
+        Convert words in a string into their equivalent numbers.
+        Args:
+            text str:
+            short_scale boolean: True if short scale numbers should be used.
+            ordinals boolean: True if ordinals (e.g. birinci, ikinci, üçüncü) should
+                              be parsed to their number values (1, 2, 3...)
+
+        Returns:
+            str
+            The original text, with numbers subbed in where appropriate.
+
+        """
+        tokens = [Token(word, index) for index, word in enumerate(word_tokenize(text))]
+        numbers_to_replace = self.extract_numbers(tokens, short_scale, ordinals)
+        results = []
+        for token in tokens:
+            if not numbers_to_replace or \
+                    token.index < numbers_to_replace[0].start_index:
+                results.append(token.word)
+            else:
+                if numbers_to_replace and \
+                        token.index == numbers_to_replace[0].start_index:
+                    results.append(str(numbers_to_replace[0].value))
+                if numbers_to_replace and \
+                        token.index == numbers_to_replace[0].end_index:
+                    numbers_to_replace.pop(0)
+
+        return ' '.join(results)
+
+    def extract_numbers(self, tokens: list, short_scale: bool = False, ordinals: bool = False) -> List:
+        """
+        Extract numeric values from a list of tokens.
+        Args:
+            tokens (list): list of tokens (str)
+            short_scale boolean: True if short scale numbers should be used.
+            ordinals boolean: True if ordinals (e.g. first, second, third) should
+                              be parsed to their number values (1, 2, 3...)
+        Returns:
+            list of extracted numbers (ReplaceableNumber)
+
+        """
+        if not isinstance(tokens[0], Token):  # list of string tokens
+            tokens = [Token(word, index) for index, word in enumerate(tokens)]
+        numbers_to_replace = self._extract_numbers_with_text_az(tokens, short_scale, ordinals)
+        numbers_to_replace.sort(key=lambda number: number.start_index)
+        return numbers_to_replace
+
+    def is_fractional(self, input_str, short_scale=True, spoken=True):
+        """
+        This function takes the given text and checks if it is a fraction.
+
+        Args:
+            input_str (str): the string to check if fractional
+            short_scale (bool): use short scale if True, long scale if False
+            spoken (bool): consider spoken fraction words
+        Returns:
+            (bool) or (float): False if not a fraction, otherwise the fraction
+
+        """
+
+        fracts = {"dörddəbir": 4, "yarım": 2, "üçdəbir": 3}
+        for num in self._FRACTION_STRING_AZ:
+            if num > 2:
+                fracts[self._FRACTION_STRING_AZ[num]] = num
+
+        if input_str.lower() in fracts and spoken:
+            return 1.0 / fracts[input_str.lower()]
+        return False
+
+    # helper methods
+
+    def _extract_numbers_with_text_az(self, tokens, short_scale=True,
+                                      ordinals=False, fractional_numbers=True):
+        """
+        Extract all numbers from a list of Tokens, with the words that
+        represent them.
+
+        Args:
+            [Token]: The tokens to parse.
+            short_scale bool: True if short scale numbers should be used, False for
+                              long scale. True by default.
+ ordinals bool: True if ordinal words (birinci, ikinci, üçüncü, etc) should + be parsed. + fractional_numbers bool: True if we should look for fractions and + decimals. + + Returns: + [ReplaceableNumber]: A list of tuples, each containing a number and a + string. + + """ + placeholder = "" # inserted to maintain correct indices + results = [] + while True: + to_replace = \ + self._extract_number_with_text_az(tokens, short_scale, + ordinals, fractional_numbers) + if not to_replace: + break + + results.append(to_replace) + + tokens = [ + t if not + to_replace.start_index <= t.index <= to_replace.end_index + else + Token(placeholder, t.index) for t in tokens + ] + results.sort(key=lambda n: n.start_index) + return results + + def _extract_number_with_text_az(self, tokens, short_scale=True, + ordinals=False, fractional_numbers=True): + """ + This function extracts a number from a list of Tokens. + + Args: + tokens str: the string to normalize + short_scale (bool): use short scale if True, long scale if False + ordinals (bool): consider ordinal numbers + fractional_numbers (bool): True if we should look for fractions and + decimals. + Returns: + ReplaceableNumber + + """ + number, tokens = \ + self._extract_number_with_text_az_helper(tokens, short_scale, + ordinals, fractional_numbers) + return ReplaceableNumber(number, tokens) + + def _extract_number_with_text_az_helper(self, tokens, + short_scale=True, ordinals=False, + fractional_numbers=True): + """ + Helper for _extract_number_with_text_az. + + This contains the real logic for parsing, but produces + a result that needs a little cleaning (specific, it may + contain leading articles that can be trimmed off). + + Args: + tokens [Token]: + short_scale boolean: + ordinals boolean: + fractional_numbers boolean: + + Returns: + int or float, [Tokens] + + """ + if fractional_numbers: + fraction, fraction_text = \ + self._extract_fraction_with_text_az(tokens, short_scale, ordinals) + if fraction: + # print("fraction") + return fraction, fraction_text + + decimal, decimal_text = \ + self._extract_decimal_with_text_az(tokens, short_scale, ordinals) + if decimal: + # print("decimal") + return decimal, decimal_text + + return self._extract_whole_number_with_text_az(tokens, short_scale, ordinals) + + def _extract_fraction_with_text_az(self, tokens, short_scale, ordinals): + """ + Extract fraction numbers from a string. + + This function handles text such as '2 və dörddə üç'. Note that "yarım" or + similar will be parsed by the whole number function. + + Args: + tokens [Token]: words and their indexes in the original string. + short_scale boolean: + ordinals boolean: + + Returns: + (int or float, [Token]) + The value found, and the list of relevant tokens. + (None, None) if no fraction value is found. 
+ + """ + for c in self._FRACTION_MARKER_AZ: + partitions = partition_list(tokens, lambda t: t.word == c) + + if len(partitions) == 3: + numbers1 = \ + self._extract_numbers_with_text_az(partitions[0], short_scale, + ordinals, fractional_numbers=False) + numbers2 = \ + self._extract_numbers_with_text_az(partitions[2], short_scale, + ordinals, fractional_numbers=True) + + if not numbers1 or not numbers2: + return None, None + + # ensure first is not a fraction and second is a fraction + num1 = numbers1[-1] + num2 = numbers2[0] + if num1.value >= 1 and 0 < num2.value < 1: + return num1.value + num2.value, \ + num1.tokens + partitions[1] + num2.tokens + + return None, None + + def _extract_decimal_with_text_az(self, tokens, short_scale, ordinals): + """ + Extract decimal numbers from a string. + + This function handles text such as '2 nöqtə 5'. + + Notes: + While this is a helper for extractnumber_az, it also depends on + extractnumber_az, to parse out the components of the decimal. + + This does not currently handle things like: + number dot number number number + + Args: + tokens [Token]: The text to parse. + short_scale boolean: + ordinals boolean: + + Returns: + (float, [Token]) + The value found and relevant tokens. + (None, None) if no decimal value is found. + + """ + for c in self._DECIMAL_MARKER_AZ: + partitions = partition_list(tokens, lambda t: t.word == c) + + if len(partitions) == 3: + numbers1 = \ + self._extract_numbers_with_text_az(partitions[0], short_scale, + ordinals, fractional_numbers=False) + numbers2 = \ + self._extract_numbers_with_text_az(partitions[2], short_scale, + ordinals, fractional_numbers=False) + if not numbers1 or not numbers2: + return None, None + + number = numbers1[-1] + decimal = numbers2[0] + + # TODO handle number dot number number number + if "." not in str(decimal.text): + return number.value + float('0.' + str(decimal.value)), \ + number.tokens + partitions[1] + decimal.tokens + return None, None + + def _extract_whole_number_with_text_az(self, tokens, short_scale, ordinals): + """ + Handle numbers not handled by the decimal or fraction functions. This is + generally whole numbers. Note that phrases such as "yarım" will be + handled by this function. + + Args: + tokens [Token]: + short_scale boolean: + ordinals boolean: + + Returns: + int or float, [Tokens] + The value parsed, and tokens that it corresponds to. 
+ + """ + multiplies, string_num_ordinal, string_num_scale = \ + self._initialize_number_data_az(short_scale, speech=ordinals is not None) + + number_words = [] # type: List[Token] + val = False + prev_val = None + next_val = None + to_sum = [] + # print(tokens, ordinals) + for idx, token in enumerate(tokens): + current_val = None + if next_val: + next_val = None + continue + + word = token.word.lower() + if word in self._NEGATIVES_AZ: + number_words.append(token) + continue + + prev_word = tokens[idx - 1].word.lower() if idx > 0 else "" + next_word = tokens[idx + 1].word.lower() if idx + 1 < len(tokens) else "" + # print(prev_word, word, next_word, number_words) + if word not in string_num_scale and \ + word not in self._STRING_NUM_AZ and \ + word not in self._SUMS_AZ and \ + word not in multiplies and \ + not (ordinals and word in string_num_ordinal) and \ + not is_numeric(word) and \ + not self.is_fractional(word, short_scale=short_scale) and \ + not look_for_fractions(word.split('/')): + # print("a1") + words_only = [token.word for token in number_words] + + if number_words and not all([w.lower() in + self._NEGATIVES_AZ for w in words_only]): + break + else: + number_words = [] + continue + elif word not in multiplies \ + and word not in self._SPOKEN_EXTRA_NUM_AZ \ + and prev_word not in multiplies \ + and prev_word not in self._SUMS_AZ \ + and not (ordinals and prev_word in string_num_ordinal) \ + and prev_word not in self._NEGATIVES_AZ: + number_words = [token] + # print("a2") + elif prev_word in self._SUMS_AZ and word in self._SUMS_AZ: + number_words = [token] + # print("a3") + elif ordinals is None and \ + (word in string_num_ordinal or word in self._SPOKEN_EXTRA_NUM_AZ): + # print("a4") + # flagged to ignore this token + continue + else: + # print("a5") + number_words.append(token) + + # is this word already a number ? + if is_numeric(word): + # print("b") + if word.isdigit(): # doesn't work with decimals + val = int(word) + else: + val = float(word) + current_val = val + + # is this word the name of a number ? + if word in self._STRING_NUM_AZ: + val = self._STRING_NUM_AZ.get(word) + current_val = val + # print("c1", current_val) + elif word in string_num_scale: + val = string_num_scale.get(word) + current_val = val + # print("c2") + elif ordinals and word in string_num_ordinal: + val = string_num_ordinal[word] + current_val = val + # print("c3") + # is the prev word a number and should we sum it? + # twenty two, fifty six + if (prev_word in self._SUMS_AZ and val and val < 10) or all([prev_word in + multiplies, + val < prev_val if prev_val else False]): + val = prev_val + val + # print("d") + + # is the prev word a number and should we multiply it? + # twenty hundred, six hundred + if word in multiplies: + if not prev_val: + prev_val = 1 + val = prev_val * val + # print("e") + + # is this a spoken fraction? 
+ # 1 yarım fincan - yarım fincan + if current_val is None and not (ordinals is None and word in self._SPOKEN_EXTRA_NUM_AZ): + val = self.is_fractional(word, short_scale=short_scale, + spoken=ordinals is not None) + if val: + if prev_val: + val += prev_val + current_val = val + # print("f", current_val, prev_val) + if word in self._SPOKEN_EXTRA_NUM_AZ: + break + + # dörddə bir + if ordinals is False: + temp = prev_val + prev_val = self.is_fractional(prev_word, short_scale=short_scale) + if prev_val: + if not val: + val = 1 + val = val * prev_val + if idx + 1 < len(tokens): + number_words.append(tokens[idx + 1]) + else: + prev_val = temp + # print("g", prev_val) + + # is this a negative number? + if val and prev_word and prev_word in self._NEGATIVES_AZ: + val = 0 - val + # print("h") + + # let's make sure it isn't a fraction + if not val: + # look for fractions like "2/3" + aPieces = word.split('/') + if look_for_fractions(aPieces): + val = float(aPieces[0]) / float(aPieces[1]) + current_val = val + # print("i") + + else: + if current_val and all([ + prev_word in self._SUMS_AZ, + word not in self._SUMS_AZ, + word not in multiplies, + current_val >= 10]): + # Backtrack - we've got numbers we can't sum. + # print("j", number_words, prev_val) + number_words.pop() + val = prev_val + break + prev_val = val + + if word in multiplies and next_word not in multiplies: + # handle long numbers + # six hundred sixty six + # two million five hundred thousand + # + # This logic is somewhat complex, and warrants + # extensive documentation for the next coder's sake. + # + # The current word is a power of ten. `current_val` is + # its integer value. `val` is our working sum + # (above, when `current_val` is 1 million, `val` is + # 2 million.) + # + # We have a dict `string_num_scale` containing [word, value] + # pairs for "all" powers of ten: string_num_scale["ten"] == 10. + # + # We need to go over the rest of the tokens, looking for other + # powers of ten. If we find one, we compare it with the current + # value, to see if it's smaller than the current power of ten. + # + # Numbers which are not powers of ten will be passed over. + # + # If all the remaining powers of ten are smaller than our + # current value, we can set the current value aside for later, + # and begin extracting another portion of our final result. + # For example, suppose we have the following string. + # The current word is "million". `val` is 9000000. + # `current_val` is 1000000. + # + # "nine **million** nine *hundred* seven **thousand** + # six *hundred* fifty seven" + # + # Iterating over the rest of the string, the current + # value is larger than all remaining powers of ten. + # + # The if statement passes, and nine million (9000000) + # is appended to `to_sum`. + # + # The main variables are reset, and the main loop begins + # assembling another number, which will also be appended + # under the same conditions.
+ # + # By the end of the main loop, to_sum will be a list of each + # "place" from 100 up: [9000000, 907000, 600] + # + # The final three digits will be added to the sum of that list + # at the end of the main loop, to produce the extracted number: + # + # sum([9000000, 907000, 600]) + 57 + # == 9,000,000 + 907,000 + 600 + 57 + # == 9,907,657 + # + # >>> foo = "nine million nine hundred seven thousand six + # hundred fifty seven" + # >>> extract_number(foo) + # 9907657 + # print("k", tokens[idx+1:]) + time_to_sum = True + for other_token in tokens[idx + 1:]: + if other_token.word.lower() in multiplies: + if string_num_scale[other_token.word.lower()] >= current_val: + time_to_sum = False + else: + continue + if not time_to_sum: + break + if time_to_sum: + # print("l") + to_sum.append(val) + val = 0 + prev_val = 0 + + if val is not None and to_sum: + # print("m", to_sum) + val += sum(to_sum) + # print(val, number_words, "end") + return val, number_words + + def _initialize_number_data_az(self, short_scale, speech=True): + """ + Generate dictionaries of words to numbers, based on scale. + + This is a helper function for _extract_whole_number. + + Args: + short_scale (bool): + speech (bool): consider extra words (_SPOKEN_EXTRA_NUM_AZ) to be numbers + + Returns: + (set(str), dict(str, number), dict(str, number)) + multiplies, string_num_ordinal, string_num_scale + + """ + multiplies = self._MULTIPLIES_SHORT_SCALE_AZ if short_scale \ + else self._MULTIPLIES_LONG_SCALE_AZ + + string_num_ordinal_az = self._STRING_SHORT_ORDINAL_AZ if short_scale \ + else self._STRING_LONG_ORDINAL_AZ + + string_num_scale_az = self._SHORT_SCALE_AZ if short_scale else self._LONG_SCALE_AZ + string_num_scale_az = {v: k for k, v in string_num_scale_az.items()} + + return multiplies, string_num_ordinal_az, string_num_scale_az diff --git a/ovos_utterance_normalizer/res/az/normalize.json b/ovos_utterance_normalizer/res/az/normalize.json new file mode 100644 index 0000000..1a7729a --- /dev/null +++ b/ovos_utterance_normalizer/res/az/normalize.json @@ -0,0 +1,45 @@ +{ + "lowercase": false, + "numbers_to_digits": true, + "expand_contractions": true, + "remove_symbols": true, + "remove_accents": false, + "remove_articles": false, + "remove_stopwords": false, + "contractions": {}, + "word_replacements": { + "sora": "sonra" + }, + "number_replacements": { + "sıfır": "0", + "bir": "1", + "iki": "2", + "üç": "3", + "dörd": "4", + "beş": "5", + "altı": "6", + "yeddi": "7", + "səkkiz": "8", + "doqquz": "9", + "on": "10", + "on bir": "11", + "on iki": "12", + "on üç": "13", + "on dörd": "14", + "on beş": "15", + "on altı": "16", + "on yeddi": "17", + "on səkkiz": "18", + "on doqquz": "19", + "iyirmi": "20", + "otuz": "30", + "qırx": "40", + "əlli": "50", + "altmiş": "60", + "yetmiş": "70", + "səksən": "80", + "doxsan": "90" + }, + "stopwords": [], + "articles": [] +} \ No newline at end of file diff --git a/ovos_utterance_normalizer/res/ca/normalize.json b/ovos_utterance_normalizer/res/ca/normalize.json new file mode 100644 index 0000000..76fbdc2 --- /dev/null +++ b/ovos_utterance_normalizer/res/ca/normalize.json @@ -0,0 +1,109 @@ +{ + "lowercase": false, + "numbers_to_digits": true, + "expand_contractions": false, + "remove_symbols": true, + "remove_accents": false, + "remove_articles": false, + "remove_stopwords": true, + "contractions": {}, + "word_replacements": {}, + "number_replacements": { + "catorze": "14", + "cent": "100", + "cents": "100", + "cinc": "5", + "cinc-centes": "500", + "cinc-cents": "500", + "cinquanta": "50", + 
"deu": "10", + "dinou": "19", + "setze": "16", + "disset": "17", + "dihuit": "18", + "divuit": "18", + "dos": "2", + "dos-centes": "200", + "dos-cents": "200", + "dotze": "12", + "dues": "2", + "dues-centes": "200", + "huitanta": "80", + "huit": "8", + "huit-centes": "800", + "huit-cents": "800", + "mil": "1000", + "milió": "1000000", + "nou": "9", + "nou-centes": "900", + "nou-cents": "900", + "noranta": "90", + "onze": "11", + "primer": "1", + "primera": "1", + "quaranta": "40", + "quatre": "4", + "quatre-centes": "400", + "quatre-cents": "400", + "quinze": "15", + "segon": "2", + "segona": "2", + "seixanta": "60", + "set": "7", + "set-centes": "700", + "set-cents": "700", + "setanta": "70", + "sis": "6", + "sis-centes": "600", + "sis-cents": "600", + "tercer": "3", + "trenta": "30", + "tres": "3", + "tres-centes": "300", + "tres-cents": "300", + "tretze": "13", + "u": "1", + "un": "1", + "una": "1", + "vint": "20", + "vuitanta": "80", + "vuit": "8", + "vuit-centes": "800", + "vuit-cents": "800", + "zero": "0" + }, + "stopwords": [ + "de", + "del", + "dels", + "ell", + "ella", + "ells", + "elles", + "jo", + "i", + "al", + "dins la", + "a la", + "nosaltres", + "dins el", + "para", + "aquest", + "aquesta", + "aquests", + "aquestes", + "aquell", + "aquella", + "aquells", + "aquelles", + "que" + ], + "articles": [ + "el", + "la", + "l", + "els", + "les", + "los" + ] +} diff --git a/ovos_utterance_normalizer/res/cz/normalize.json b/ovos_utterance_normalizer/res/cz/normalize.json new file mode 100644 index 0000000..c7836ee --- /dev/null +++ b/ovos_utterance_normalizer/res/cz/normalize.json @@ -0,0 +1,46 @@ +{ + "lowercase": false, + "numbers_to_digits": true, + "expand_contractions": true, + "remove_symbols": true, + "remove_accents": false, + "remove_articles": false, + "remove_stopwords": false, + "contractions": {}, + "word_replacements": {}, + "number_replacements": { + "nula": "0", + "jedna": "1", + "dva": "2", + "dvě": "2", + "tři": "3", + "čtyři": "4", + "pět": "5", + "šest": "6", + "sedm": "7", + "sedum": "7", + "osm": "8", + "osum": "8", + "devět": "9", + "deset": "10", + "jedenáct": "11", + "dvanáct": "12", + "třináct": "13", + "čtrnáct": "14", + "patnáct": "15", + "šestnáct": "16", + "sedmnáct": "17", + "osmnáct": "18", + "devatenáct": "19", + "dvacet": "20", + "třicet": "30", + "čtyřicet": "40", + "padesát": "50", + "šedesát": "60", + "sedmdesát": "70", + "osmdesát": "80", + "devadesát": "90" + }, + "stopwords": [], + "articles": [] +} \ No newline at end of file diff --git a/ovos_utterance_normalizer/res/de/normalize.json b/ovos_utterance_normalizer/res/de/normalize.json new file mode 100644 index 0000000..63e5204 --- /dev/null +++ b/ovos_utterance_normalizer/res/de/normalize.json @@ -0,0 +1,122 @@ +{ + "lowercase": false, + "numbers_to_digits": true, + "expand_contractions": true, + "remove_symbols": true, + "remove_accents": false, + "remove_articles": false, + "remove_stopwords": false, + "contractions": { + "am": "an dem", + "ans": "an das", + "aufs": "auf das", + "beim": "bei dem", + "durchs": "durch das", + "fürs": "für das", + "hinterm": "hinter dem", + "ins": "in das", + "übers": "über das", + "unters": "unter das", + "unterm": "unter dem", + "vom": "von dem", + "vors": "vor das", + "vorm": "vor dem", + "zum": "zu dem" + }, + "word_replacements": { + "mg": "milligramm", + "kg": "kilogramm", + "g": "gramm", + "nm": "nanometer", + "µm": "mikrometer", + "mm": "millimeter", + "mm^2": "quadratmillimeter", + "mm²": "quadratmillimeter", + "cm": "zentimeter", + "cm^2": 
"quadratzentimeter", + "cm²": "quadratzentimeter", + "cm^3": "kubikzentimeter", + "cm³": "kubikzentimeter", + "dm": "dezimeter", + "m": "meter", + "m^2": "quadratmeter", + "m²": "quadratmeter", + "m^3": "kubikmeter", + "m³": "kubikmeter", + "km": "kilometer", + "km^2": "quadratkilometer", + "km²": "quadratkilometer", + "ha": "hektar", + "w": "watt", + "j": "joule", + "kj": "kilojoule", + "k_b": "kilobyte", + "m_b": "megabyte", + "g_b": "gigabyte", + "t_b": "terabyte", + "p_b": "petabyte", + "k_w": "kilowatt", + "kb": "kilobyte", + "mb": "megabyte", + "gb": "gigabyte", + "tb": "terabyte", + "pb": "petabyte", + "kw": "kilowatt", + "m_w": "megawatt", + "g_w": "gigawatt", + "mw": "megawatt", + "gw": "gigawatt", + "°": "grad", + "°c": "grad celsius", + "°f": "grad fahrenheit" + }, + "number_replacements": { + "null": "0", + "eins": "1", + "zwei": "2", + "drei": "3", + "vier": "4", + "fünf": "5", + "sechs": "6", + "sieben": "7", + "acht": "8", + "neun": "9", + "zehn": "10", + "elf": "11", + "zwölf": "12", + "dreizehn": "13", + "vierzehn": "14", + "fünfzehn": "15", + "sechzehn": "16", + "siebzehn": "17", + "achtzehn": "18", + "neunzehn": "19", + "zwanzig": "20", + "einundzwanzig": "21", + "zweiundzwanzig": "22", + "dreiundzwanzig": "23", + "vierundzwanzig": "24", + "fünfundzwanzig": "25", + "sechsundzwanzig": "26", + "siebenundzwanzig": "27", + "achtundzwanzig": "28", + "neunundzwanzig": "29", + "dreißig": "30", + "einunddreißig": "31", + "vierzig": "40", + "fünfzig": "50", + "sechtzig": "60", + "siebzig": "70", + "achtzig": "80", + "neunzig": "90" + }, + "stopwords": [], + "articles": [ + "der", + "die", + "das", + "dem", + "den", + "des" + ] +} diff --git a/ovos_utterance_normalizer/res/en/normalize.json b/ovos_utterance_normalizer/res/en/normalize.json new file mode 100644 index 0000000..9ae7bf3 --- /dev/null +++ b/ovos_utterance_normalizer/res/en/normalize.json @@ -0,0 +1,215 @@ +{ + "lowercase": false, + "numbers_to_digits": true, + "expand_contractions": true, + "remove_symbols": true, + "remove_accents": false, + "remove_articles": false, + "remove_stopwords": false, + "contractions": { + "I'd": "I would", + "I'll": "I will", + "I'm": "I am", + "I've": "I have", + "ain't": "is not", + "aren't": "are not", + "can't": "can not", + "could've": "could have", + "couldn't": "could not", + "didn't": "did not", + "doesn't": "does not", + "don't": "do not", + "gonna": "going to", + "gotta": "got to", + "hadn't": "had not", + "hasn't": "has not", + "haven't": "have not", + "he'd": "he would", + "he'll": "he will", + "he's": "he is", + "how'd": "how did", + "how'll": "how will", + "how's": "how is", + "isn't": "is not", + "it'd": "it would", + "it'll": "it will", + "it's": "it is", + "might've": "might have", + "mightn't": "might not", + "must've": "must have", + "mustn't": "must not", + "needn't": "need not", + "oughtn't": "ought not", + "shan't": "shall not", + "she'd": "she would", + "she'll": "she will", + "she's": "she is", + "should've": "should have", + "shouldn't": "should not", + "somebody's": "somebody is", + "someone'd": "someone would", + "someone'll": "someone will", + "someone's": "someone is", + "that'd": "that would", + "that'll": "that will", + "that's": "that is", + "there'd": "there would", + "there're": "there are", + "there's": "there is", + "they'd": "they would", + "they'll": "they will", + "they're": "they are", + "they've": "they have", + "wasn't": "was not", + "we'd": "we would", + "we'll": "we will", + "we're": "we are", + "we've": "we have", + "weren't": "were not", + 
"what'd": "what did", + "what'll": "what will", + "what're": "what are", + "what's": "what is", + "what've": "what have", + "whats": "what is", + "when'd": "when did", + "when's": "when is", + "where'd": "where did", + "where's": "where is", + "where've": "where have", + "who'd": "who would", + "who'd've": "who would have", + "who'll": "who will", + "who're": "who are", + "who's": "who is", + "who've": "who have", + "why'd": "why did", + "why're": "why are", + "why's": "why is", + "won't": "will not", + "won't've": "will not have", + "would've": "would have", + "wouldn't": "would not", + "wouldn't've": "would not have", + "y'ain't": "you are not", + "y'aint": "you are not", + "y'all": "you all", + "ya'll": "you all", + "you'd": "you would", + "you'd've": "you would have", + "you'll": "you will", + "you're": "you are", + "you've": "you have", + "I'm'a": "I am going to", + "I'm'o": "I am going to", + "I'll've": "I will have", + "I'd've": "I would have", + "Whatcha": "What are you", + "amn't": "am not", + "'cause": "because", + "can't've": "cannot have", + "couldn't've": "could not have", + "daren't": "dare not", + "daresn't": "dare not", + "dasn't": "dare not", + "everyone's": "everyone is", + "gimme": "give me", + "gon't": "go not", + "hadn't've": "had not have", + "he've": "he would have", + "he'll've": "he will have", + "he'd've": "he would have", + "here's": "here is", + "how're": "how are", + "how'd'y": "how do you do", + "howd'y": "how do you do", + "howdy": "how do you do", + "'tis": "it is", + "'twas": "it was", + "it'll've": "it will have", + "it'd've": "it would have", + "kinda": "kind of", + "let's": "let us", + "ma'am": "madam", + "may've": "may have", + "mayn't": "may not", + "mightn't've": "might not have", + "mustn't've": "must not have", + "needn't've": "need not have", + "ol'": "old", + "oughtn't've": "ought not have", + "sha'n't": "shall not", + "shan't": "shall not", + "shalln't": "shall not", + "shan't've": "shall not have", + "she'd've": "she would have", + "shouldn't've": "should not have", + "so've": "so have", + "so's": "so is", + "something's": "something is", + "that're": "that are", + "that'd've": "that would have", + "there'll": "there will", + "there'd've": "there would have", + "these're": "these are", + "they'll've": "they will have", + "they'd've": "they would have", + "this's": "this is", + "this'll": "this will", + "this'd": "this would", + "those're": "those are", + "to've": "to have", + "wanna": "want to", + "we'll've": "we will have", + "we'd've": "we would have", + "what'll've": "what will have", + "when've": "when have", + "where're": "where are", + "which's": "which is", + "who'll've": "who will have", + "why've": "why have", + "will've": "will have", + "y'all're": "you all are", + "y'all've": "you all have", + "y'all'd": "you all would", + "y'all'd've": "you all would have", + "you'll've": "you will have" + }, + "word_replacements": {}, + "number_replacements": { + "zero": "0", + "one": "1", + "two": "2", + "three": "3", + "four": "4", + "five": "5", + "six": "6", + "seven": "7", + "eight": "8", + "nine": "9", + "ten": "10", + "eleven": "11", + "twelve": "12", + "thirteen": "13", + "fourteen": "14", + "fifteen": "15", + "sixteen": "16", + "seventeen": "17", + "eighteen": "18", + "nineteen": "19", + "twenty": "20", + "thirty": "30", + "forty": "40", + "fifty": "50", + "sixty": "60", + "seventy": "70", + "eighty": "80", + "ninety": "90" + }, + "stopwords": [], + "articles": [ + "the", + "a", + "an" + ] +} diff --git 
a/ovos_utterance_normalizer/res/es/normalize.json b/ovos_utterance_normalizer/res/es/normalize.json new file mode 100644 index 0000000..1a7c447 --- /dev/null +++ b/ovos_utterance_normalizer/res/es/normalize.json @@ -0,0 +1,14 @@ +{ + "lowercase": false, + "numbers_to_digits": true, + "expand_contractions": true, + "remove_symbols": true, + "remove_accents": false, + "remove_articles": false, + "remove_stopwords": false, + "contractions": {}, + "word_replacements": {}, + "number_replacements": {}, + "stopwords": [], + "articles": [] +} diff --git a/ovos_utterance_normalizer/res/fr/normalize.json b/ovos_utterance_normalizer/res/fr/normalize.json new file mode 100644 index 0000000..1a7c447 --- /dev/null +++ b/ovos_utterance_normalizer/res/fr/normalize.json @@ -0,0 +1,14 @@ +{ + "lowercase": false, + "numbers_to_digits": true, + "expand_contractions": true, + "remove_symbols": true, + "remove_accents": false, + "remove_articles": false, + "remove_stopwords": false, + "contractions": {}, + "word_replacements": {}, + "number_replacements": {}, + "stopwords": [], + "articles": [] +} diff --git a/ovos_utterance_normalizer/res/it/normalize.json b/ovos_utterance_normalizer/res/it/normalize.json new file mode 100644 index 0000000..1a7c447 --- /dev/null +++ b/ovos_utterance_normalizer/res/it/normalize.json @@ -0,0 +1,14 @@ +{ + "lowercase": false, + "numbers_to_digits": true, + "expand_contractions": true, + "remove_symbols": true, + "remove_accents": false, + "remove_articles": false, + "remove_stopwords": false, + "contractions": {}, + "word_replacements": {}, + "number_replacements": {}, + "stopwords": [], + "articles": [] +} diff --git a/ovos_utterance_normalizer/res/nl/normalize.json b/ovos_utterance_normalizer/res/nl/normalize.json new file mode 100644 index 0000000..1a7c447 --- /dev/null +++ b/ovos_utterance_normalizer/res/nl/normalize.json @@ -0,0 +1,14 @@ +{ + "lowercase": false, + "numbers_to_digits": true, + "expand_contractions": true, + "remove_symbols": true, + "remove_accents": false, + "remove_articles": false, + "remove_stopwords": false, + "contractions": {}, + "word_replacements": {}, + "number_replacements": {}, + "stopwords": [], + "articles": [] +} diff --git a/ovos_utterance_normalizer/res/no/normalize.json b/ovos_utterance_normalizer/res/no/normalize.json new file mode 100644 index 0000000..1a7c447 --- /dev/null +++ b/ovos_utterance_normalizer/res/no/normalize.json @@ -0,0 +1,14 @@ +{ + "lowercase": false, + "numbers_to_digits": true, + "expand_contractions": true, + "remove_symbols": true, + "remove_accents": false, + "remove_articles": false, + "remove_stopwords": false, + "contractions": {}, + "word_replacements": {}, + "number_replacements": {}, + "stopwords": [], + "articles": [] +} diff --git a/ovos_utterance_normalizer/res/pt/normalize.json b/ovos_utterance_normalizer/res/pt/normalize.json new file mode 100644 index 0000000..52fd4b8 --- /dev/null +++ b/ovos_utterance_normalizer/res/pt/normalize.json @@ -0,0 +1,98 @@ +{ + "lowercase": false, + "numbers_to_digits": true, + "expand_contractions": false, + "remove_symbols": true, + "remove_accents": false, + "remove_articles": false, + "remove_stopwords": true, + "contractions": {}, + "word_replacements": {}, + "number_replacements": { + "catorze": "14", + "cem": "100", + "cento": "100", + "cinco": "5", + "cinquenta": "50", + "dez": "10", + "dezanove": "19", + "dezasseis": "16", + "dezassete": "17", + "dezoito": "18", + "dois": "2", + "doze": "12", + "duas": "2", + "duzentas": "200", + "duzentos": "200", + "mil": "1000", + 
"milhão": "1000000", + "nove": "9", + "novecentas": "900", + "novecentos": "900", + "noventa": "90", + "oitenta": "80", + "oito": "8", + "oitocentas": "800", + "oitocentos": "800", + "onze": "11", + "primeiro": "1", + "quarenta": "40", + "quatro": "4", + "quatrocentas": "400", + "quatrocentos": "400", + "quinhentas": "500", + "quinhentos": "500", + "quinze": "15", + "segundo": "2", + "seis": "6", + "seiscentas": "600", + "seiscentos": "600", + "sessenta": "60", + "sete": "7", + "setecentas": "700", + "setecentos": "700", + "setenta": "70", + "terceiro": "3", + "tres": "3", + "treze": "13", + "trezentas": "300", + "trezentos": "300", + "trinta": "30", + "três": "3", + "um": "1", + "uma": "1", + "vinte": "20", + "zero": "0" + }, + "stopwords": [ + "de", + "dos", + "das", + "lhe", + "lhes", + "me", + "e", + "no", + "nas", + "na", + "nos", + "em", + "para", + "este", + "esta", + "deste", + "desta", + "neste", + "nesta", + "nesse", + "nessa", + "foi", + "que" + ], + "articles": [ + "o", + "a", + "os", + "as" + ] +} \ No newline at end of file diff --git a/ovos_utterance_normalizer/res/ru/normalize.json b/ovos_utterance_normalizer/res/ru/normalize.json new file mode 100644 index 0000000..b7322d9 --- /dev/null +++ b/ovos_utterance_normalizer/res/ru/normalize.json @@ -0,0 +1,46 @@ +{ + "lowercase": false, + "numbers_to_digits": true, + "expand_contractions": true, + "remove_symbols": true, + "remove_accents": false, + "remove_articles": false, + "remove_stopwords": false, + "contractions": {}, + "word_replacements": {}, + "number_replacements": { + "ноль": "0", + "нуль": "0", + "один": "1", + "одна": "1", + "два": "2", + "две": "2", + "три": "3", + "четыре": "4", + "пять": "5", + "шесть": "6", + "семь": "7", + "восемь": "8", + "девять": "9", + "десять": "10", + "одиннадцать": "11", + "двенадцать": "12", + "тринадцать": "13", + "четырнадцать": "14", + "пятнадцать": "15", + "шестнадцать": "16", + "семнадцать": "17", + "восемнадцать": "18", + "девятнадцать": "19", + "двадцать": "20", + "тридцать": "30", + "сорок": "40", + "пятьдесят": "50", + "шестьдесят": "60", + "семьдесят": "70", + "восемьдесят": "80", + "девяносто": "90" + }, + "stopwords": [], + "articles": [] +} \ No newline at end of file diff --git a/ovos_utterance_normalizer/res/sl/normalize.json b/ovos_utterance_normalizer/res/sl/normalize.json new file mode 100644 index 0000000..1a7c447 --- /dev/null +++ b/ovos_utterance_normalizer/res/sl/normalize.json @@ -0,0 +1,14 @@ +{ + "lowercase": false, + "numbers_to_digits": true, + "expand_contractions": true, + "remove_symbols": true, + "remove_accents": false, + "remove_articles": false, + "remove_stopwords": false, + "contractions": {}, + "word_replacements": {}, + "number_replacements": {}, + "stopwords": [], + "articles": [] +} diff --git a/ovos_utterance_normalizer/res/sv/normalize.json b/ovos_utterance_normalizer/res/sv/normalize.json new file mode 100644 index 0000000..1a7c447 --- /dev/null +++ b/ovos_utterance_normalizer/res/sv/normalize.json @@ -0,0 +1,14 @@ +{ + "lowercase": false, + "numbers_to_digits": true, + "expand_contractions": true, + "remove_symbols": true, + "remove_accents": false, + "remove_articles": false, + "remove_stopwords": false, + "contractions": {}, + "word_replacements": {}, + "number_replacements": {}, + "stopwords": [], + "articles": [] +} diff --git a/ovos_utterance_normalizer/res/uk/normalize.json b/ovos_utterance_normalizer/res/uk/normalize.json new file mode 100644 index 0000000..aa13d2c --- /dev/null +++ b/ovos_utterance_normalizer/res/uk/normalize.json @@ 
-0,0 +1,74 @@ +{ + "lowercase": false, + "numbers_to_digits": true, + "expand_contractions": true, + "remove_symbols": true, + "remove_accents": false, + "remove_articles": false, + "remove_stopwords": false, + "contractions": {}, + "word_replacements": {}, + "number_replacements": { + "ноль": "0", + "нуль": "0", + "нуля": "0", + "один": "1", + "одна": "1", + "одну": "1", + "одного": "1", + "пару": "2", + "пари": "2", + "пара": "2", + "два": "2", + "двох": "2", + "дві": "2", + "три": "3", + "трьох": "3", + "чотири": "4", + "п'ять": "5", + "шість": "6", + "сім": "7", + "вісім": "8", + "дев'ять": "9", + "десять": "10", + "одинадцять": "11", + "дванадцять": "12", + "тринадцять": "13", + "чотирнадцять": "14", + "п'ятнадцять": "15", + "шістнадцять": "16", + "сімнадцять": "17", + "вісімнадцять": "18", + "дев'ятнадцять": "19", + "двадцять": "20", + "тридцять": "30", + "сорок": "40", + "п'ятдесят": "50", + "шістдесят": "60", + "сімдесят": "70", + "вісімдесят": "80", + "дев'яносто": "90", + "сто": "100", + "двісті": "200", + "триста": "300", + "чотириста": "400", + "п'ятсот": "500", + "шістсот": "600", + "сімсот": "700", + "вісімсот": "800", + "дев'ятсот": "900", + "дві сотні": "200", + "три сотні": "300", + "чотири сотні": "400", + "п'ять сотень": "500", + "шість сотень": "600", + "сім сотень": "700", + "вісім сотень": "800", + "дев'ять сотень": "900", + "тисячі": "1000", + "тисяча": "1000", + "тисяч": "1000" + }, + "stopwords": [], + "articles": [] +} diff --git a/ovos_utterance_normalizer/tokenization.py b/ovos_utterance_normalizer/tokenization.py new file mode 100644 index 0000000..7a57b10 --- /dev/null +++ b/ovos_utterance_normalizer/tokenization.py @@ -0,0 +1,195 @@ +import re +from collections import namedtuple +from datetime import datetime, date, timedelta, time +from typing import List, Any +from ovos_utils import flatten_list +from quebra_frases import word_tokenize as _wtok, sentence_tokenize as _stok + +# Token is intended to be used in the number processing functions in +# this module. The parsing requires slicing and dividing of the original +# text. To ensure things parse correctly, we need to know where text came +# from in the original input, hence this nametuple. +Token = namedtuple('Token', 'word index') + + +class ReplaceableEntity: + """ + Similar to Token, this class is used in entity parsing. + + Once we've found an entity in a string, this class contains all + the info about the value, and where it came from in the original text. + In other words, it is the text, and the entity that can replace it in + the string. + """ + + def __init__(self, value: Any, tokens: List): + self.value = value + self.tokens = tokens + + @property + def type(self): + return type(self.value) + + def __bool__(self): + return bool(self.value is not None and self.value is not False) + + @property + def start_index(self): + return self.tokens[0].index + + @property + def end_index(self): + return self.tokens[-1].index + + @property + def text(self): + return ' '.join([t.word for t in self.tokens]) + + def __setattr__(self, key, value): + try: + getattr(self, key) + except AttributeError: + super().__setattr__(key, value) + else: + raise Exception("Immutable!") + + def __str__(self): + return f"({self.value}, {self.tokens})" + + def __repr__(self): + return "{n}({v}, {t})".format(n=self.__class__.__name__, v=self.value, + t=[t.word for t in self.tokens]) + + +class ReplaceableNumber(ReplaceableEntity): + """ + Similar to Token, this class is used in number parsing. 
+ + Once we've found a number in a string, this class contains all + the info about the value, and where it came from in the original text. + In other words, it is the text, and the number that can replace it in + the string. + """ + + +class ReplaceableDate(ReplaceableEntity): + """ + Similar to Token, this class is used in date parsing. + + Once we've found a date in a string, this class contains all + the info about the value, and where it came from in the original text. + In other words, it is the text, and the date that can replace it in + the string. + """ + + def __init__(self, value: date, tokens: List): + if isinstance(value, datetime): + value = value.date() + assert isinstance(value, date) + super().__init__(value, tokens) + + +class ReplaceableTime(ReplaceableEntity): + """ + Similar to Token, this class is used in date parsing. + + Once we've found a time in a string, this class contains all + the info about the value, and where it came from in the original text. + In other words, it is the text, and the time that can replace it in + the string. + """ + + def __init__(self, value: time, tokens: List): + if isinstance(value, datetime): + value = value.time() + assert isinstance(value, time) + super().__init__(value, tokens) + + +class ReplaceableTimedelta(ReplaceableEntity): + """ + Similar to Token, this class is used in date parsing. + + Once we've found a timedelta in a string, this class contains all + the info about the value, and where it came from in the original text. + In other words, it is the text, and the duration that can replace it in + the string. + """ + + def __init__(self, value: timedelta, tokens: List): + assert isinstance(value, timedelta) + super().__init__(value, tokens) + + +def partition_list(items, split_on): + """ + Partition a list of items. + + Works similarly to str.partition + + Args: + items: + split_on callable: + Should return a boolean. Each item will be passed to + this callable in succession, and partitions will be + created any time it returns True. 
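+ + Example (matches the implementation below): + >>> partition_list([1, 2, 3, 4, 5], lambda x: x == 3) + [[1, 2], [3], [4, 5]]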
+ + Returns: + [[any]] + + """ + splits = [] + current_split = [] + for item in items: + if split_on(item): + splits.append(current_split) + splits.append([item]) + current_split = [] + else: + current_split.append(item) + splits.append(current_split) + return list(filter(lambda x: len(x) != 0, splits)) + + +def sentence_tokenize(text): + sents = [_stok(s) for s in text.split("\n")] + return flatten_list(sents) + + +def word_tokenize(utterance, lang=None): + if lang is not None and lang.startswith("pt"): + return word_tokenize_pt(utterance) + elif lang is not None and lang.startswith("ca"): + return word_tokenize_ca(utterance) + # Split things like 12% + utterance = re.sub(r"([0-9]+)([\%])", r"\1 \2", utterance) + # Split things like #1 + utterance = re.sub(r"(\#)([0-9]+\b)", r"\1 \2", utterance) + return _wtok(utterance) + + +def word_tokenize_pt(utterance): + # Split things like 12% + utterance = re.sub(r"([0-9]+)([\%])", r"\1 \2", utterance) + # Split things like #1 + utterance = re.sub(r"(\#)([0-9]+\b)", r"\1 \2", utterance) + # Split things like amo-te + utterance = re.sub(r"([a-zA-Z]+)(-)([a-zA-Z]+\b)", r"\1 \2 \3", + utterance) + tokens = utterance.split() + if tokens and tokens[-1] == '-': + tokens = tokens[:-1] + + return tokens + + +def word_tokenize_ca(utterance): + # Split things like 12% + utterance = re.sub(r"([0-9]+)([\%])", r"\1 \2", utterance) + # Split things like #1 + utterance = re.sub(r"(\#)([0-9]+\b)", r"\1 \2", utterance) + # Don't split at - + tokens = utterance.split() + if tokens and tokens[-1] == '-': + tokens = tokens[:-1] + return tokens \ No newline at end of file diff --git a/ovos_utterance_normalizer/version.py b/ovos_utterance_normalizer/version.py new file mode 100644 index 0000000..79d91ab --- /dev/null +++ b/ovos_utterance_normalizer/version.py @@ -0,0 +1,7 @@ +# The following lines are replaced during the release process. +# START_VERSION_BLOCK +VERSION_MAJOR = 0 +VERSION_MINOR = 0 +VERSION_BUILD = 0 +VERSION_ALPHA = 1 +# END_VERSION_BLOCK diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..1275d2e --- /dev/null +++ b/requirements.txt @@ -0,0 +1,3 @@ +ovos-utils +quebra-frases +ovos-plugin-manager \ No newline at end of file diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..fcbeca2 --- /dev/null +++ b/setup.py @@ -0,0 +1,85 @@ +#!/usr/bin/env python3 +import os + +from setuptools import setup + +BASEDIR = os.path.abspath(os.path.dirname(__file__)) + + +def get_version(): + """ Find the version of the package""" + version = None + version_file = os.path.join(BASEDIR, 'ovos_utterance_normalizer', 'version.py') + major, minor, build, alpha = (None, None, None, None) + with open(version_file) as f: + for line in f: + if 'VERSION_MAJOR' in line: + major = line.split('=')[1].strip() + elif 'VERSION_MINOR' in line: + minor = line.split('=')[1].strip() + elif 'VERSION_BUILD' in line: + build = line.split('=')[1].strip() + elif 'VERSION_ALPHA' in line: + alpha = line.split('=')[1].strip() + + if ((major and minor and build and alpha) or + '# END_VERSION_BLOCK' in line): + break + version = f"{major}.{minor}.{build}" + if alpha and int(alpha) > 0: + version += f"a{alpha}" + return version + + +def package_files(directory): + paths = [] + for (path, _, filenames) in os.walk(directory): + for filename in filenames: + paths.append(os.path.join('..', path, filename)) + return paths + + +def required(requirements_file): + """ Read requirements file and remove comments and empty lines.
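+ + If the MYCROFT_LOOSE_REQUIREMENTS environment variable is set, + pinned versions are relaxed: '==' and '~=' both become '>='.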
""" + with open(os.path.join(BASEDIR, requirements_file), 'r') as f: + requirements = f.read().splitlines() + if 'MYCROFT_LOOSE_REQUIREMENTS' in os.environ: + print('USING LOOSE REQUIREMENTS!') + requirements = [r.replace('==', '>=').replace('~=', '>=') for r in requirements] + return [pkg for pkg in requirements + if pkg.strip() and not pkg.startswith("#")] + + +extra_files = package_files('ovos-utterance-normalizer/res') + + +UTTERANCE_ENTRY_POINT = ( + 'ovos-utterance-normalizer=ovos_utterance_normalizer:UtteranceNormalizerPlugin' +) + + +setup( + name='ovos-utterance-normalizer', + version=get_version(), + author='jarbasai', + author_email='jarbasai@mailfence.com', + url='https://github.com/OpenVoiceOS/ovos-utterance-normalizer', + license='apache-2.0', + packages=['ovos_utterance_normalizer'], + include_package_data=True, + package_data={"": extra_files}, + install_requires=required("requirements.txt"), + zip_safe=True, + classifiers=[ + 'Development Status :: 3 - Alpha', + 'Intended Audience :: Developers', + 'Topic :: Text Processing :: Linguistic', + 'License :: OSI Approved :: Apache Software License', + 'Programming Language :: Python :: 3.4', + 'Programming Language :: Python :: 3.5', + 'Programming Language :: Python :: 3.6', + ], + entry_points={ + 'neon.plugin.text': UTTERANCE_ENTRY_POINT + } +)