Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Build ASR Support for Regex, Email. Enhance Number, Date Entity #475

Merged
merged 16 commits into from
Apr 20, 2022
Merged
49 changes: 0 additions & 49 deletions lib/nlp/levenshtein_distance.py

This file was deleted.

208 changes: 208 additions & 0 deletions lib/nlp/text_normalization.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,208 @@
import regex as re
import string
from six.moves import range

from chatbot_ner.config import ner_logger

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

F401 'chatbot_ner.config.ner_logger' imported but unused

from ner_v1.detectors.pattern.regex.data.character_constants import CHARACTER_CONSTANTS
from ner_v2.detectors.numeral.constant import NUMBER_DETECTION_RETURN_DICT_VALUE
from ner_v2.detectors.numeral.number.number_detection import NumberDetector

# Constants
# Flags passed when compiling the fuzzy-matching pattern in this module.
_re_flags = re.V1 | re.UNICODE
# Every ASCII punctuation character, plus the danda ('।') and the space character.
PUNCTUATION_CHARACTERS = [*string.punctuation, '।', ' ']
# Matches a bounded repetition quantifier such as "{3,5}" inside a regex pattern and
# captures both bounds. Only the literal braces are escaped: escaping the whole
# expression would also escape the named groups and the pattern would never match.
CAPTURE_RANGE_RE = r"\{(?P<minimum>\d+),(?P<maximum>\d+)\}"
# Spoken variants of "@": an optional "@", then optional "at" / "the", then "rate".
# NOTE(review): "rate" is not anchored on word boundaries, so it also matches inside
# longer words (e.g. "corporate") - confirm whether that is acceptable for callers.
EMAIL_CORRECTION_RE = '@? ?(at)? ?(the)? ?(rate)'
AT_SYMBOL = '@'


def edit_distance(string1, string2, insertion_cost=1, deletion_cost=1, substitution_cost=2, max_distance=None):
    """
    Calculate the weighted levenshtein distance needed to turn string1 into string2

    Args:
        string1 (unicode): unicode string. If any encoded string type 'bytes' is passed, it will be decoded using utf-8
        string2 (unicode): unicode string. If any encoded string type 'bytes' is passed, it will be decoded using utf-8
        insertion_cost (int, optional): cost penalty for insertion operation, defaults to 1
        deletion_cost (int, optional): cost penalty for deletion operation, defaults to 1
        substitution_cost (int, optional): cost penalty for substitution operation, defaults to 2
        max_distance (int, optional): Stop computing edit distance if it grows larger than this argument.
            If None (or 0) the complete edit distance is returned. Defaults to None

    Returns:
        int: the weighted edit distance, or max_distance when the computation was cut short

    For Example:
        edit_distance('hello', 'helllo', max_distance=3)
        >> 1

        edit_distance('beautiful', 'beauty', max_distance=3)
        >> 3

    NOTE: Since, minimum edit distance is time consuming process, we have defined max_distance attribute.
    As soon as every entry of the current row exceeds max_distance the function stops and returns
    max_distance, otherwise it returns the levenshtein distance
    """
    if isinstance(string1, bytes):
        string1 = string1.decode('utf-8')

    if isinstance(string2, bytes):
        string2 = string2.decode('utf-8')

    if len(string1) > len(string2):
        # Keep the shorter string along the row to bound memory. Reversing the direction of
        # the transformation turns insertions into deletions, so the costs are swapped too.
        string1, string2 = string2, string1
        insertion_cost, deletion_cost = deletion_cost, insertion_cost

    # distances[j] is the cost of turning string1[:j] into the empty string (j deletions)
    distances = [index * deletion_cost for index in range(len(string1) + 1)]
    for index2, char2 in enumerate(string2):
        # Cost of turning the empty string into string2[:index2 + 1] (index2 + 1 insertions)
        new_distances = [(index2 + 1) * insertion_cost]
        for index1, char1 in enumerate(string1):
            if char1 == char2:
                new_distances.append(distances[index1])
            else:
                new_distances.append(min(distances[index1] + substitution_cost,
                                         distances[index1 + 1] + insertion_cost,
                                         new_distances[-1] + deletion_cost))
        distances = new_distances
        if max_distance and min(new_distances) > max_distance:
            return max_distance

    return distances[-1]


def fit_text_to_format(input_text, regex_pattern, insert_edits=None):
    r"""
    Used to modify text to match the given regex pattern.

    The text is fuzzy-matched against the pattern while tolerating up to `insert_edits` extra
    characters; the extra characters reported by the fuzzy match are then removed from the text.

    Args:
        input_text (str): processed string with numerals and character constants fixed
        regex_pattern (str): pattern to match
        insert_edits (int): maximum number of extra characters in the text that the fuzzy match may
            skip over. When not given (or 0), a heuristic based on the punctuation count is used

    Returns:
        input_text (str): modified text, or the text unchanged when nothing matched

    Example:
        fit_text_to_format(input_text='1 2 3 45', regex_pattern=r'\d{5}')
        >> "12345"
    """
    if not insert_edits:
        # A rough heuristic to allow (#_of_punctuations + 2) extra characters during fuzzy matching
        insert_edits = sum(1 for char in input_text if char in PUNCTUATION_CHARACTERS) + 2

    pattern = re.compile(f'(?b)({regex_pattern}){{i<={insert_edits}}}', flags=_re_flags)
    matched_format = pattern.search(input_text)

    # Fuzzy matching acts in a non-greedy fashion, hence the following resolution of reverse iterations
    # Eg. For regex="\d{3,5}" text="12345", fuzzy match detects "123"
    # Therefore we start checking from the maximum number
    for match in re.finditer(CAPTURE_RANGE_RE, regex_pattern):
        min_range = int(match["minimum"])
        max_range = int(match["maximum"])
        for i in range(max_range, min_range - 1, -1):
            # Pin the ranged quantifier to one exact repetition count
            temp_pattern = regex_pattern.replace(match.group(), f'{{{i}}}')
            # Compile with the same flags as the initial pattern so both behave consistently
            pattern = re.compile(f'(?b)({temp_pattern}){{i<={insert_edits}}}', flags=_re_flags)
            matched_format = pattern.search(input_text)
            if matched_format:
                regex_pattern = temp_pattern
                break
            if i == min_range:
                # No repetition count in the allowed range matched: leave the text untouched
                return input_text

    if matched_format and any(matched_format.fuzzy_counts):
        # Insert edit positions are returned at position 1 in the fuzzy_changes tuple
        fuzzy_edits = matched_format.fuzzy_changes[1]
        # Removing "additional characters" in text. Every removal shifts the later positions
        # one step to the left, hence the running correction.
        for corrector, index in enumerate(sorted(fuzzy_edits)):
            input_text = _omit_character_by_index(input_text, index - corrector)
    return input_text


def _omit_character_by_index(text, index) -> str:
return text[:index] + text[index + 1:]


def resolve_numerals(text, language) -> str:
    """
    Replace the numerals that NumberDetector finds in the text with their detected values.

    Detection runs twice on the same text: once as configured through `language` and once
    with language='hi'. Every detected original fragment is then substituted
    (case-insensitively) by the value stored for it in the detection result.

    Args:
        text (str): text in which numeric occurrences should be resolved
        language (str): Language for NumberDetector
    Returns:
        processed_text (str): modified text
    """
    detector = NumberDetector('asr_dummy', language=language)
    # FIXME: Detection fails if text starts with '0' since number detector discards it
    numerals, fragments = detector.detect_entity(text=text)
    numerals_hi, fragments_hi = detector.detect_entity(text=text, language='hi')
    numerals.extend(numerals_hi)
    fragments.extend(fragments_hi)

    resolved = text
    for numeral, fragment in zip(numerals, fragments):
        # The fragment is escaped so that it is matched literally
        resolved = re.sub(re.escape(fragment), numeral[NUMBER_DETECTION_RETURN_DICT_VALUE], resolved,
                          flags=re.IGNORECASE)
    return resolved


def resolve_characters(text) -> str:
    """
    Replace every CHARACTER_CONSTANTS key found in the text with its mapped value
    (used to resolve hindi character occurrences in text to English).

    Args:
        text (str): processed string with numerals fixed
    Returns:
        processed_text (str): modified text
    """
    # Only the fragments present in the incoming text are considered
    present = [fragment for fragment in CHARACTER_CONSTANTS if fragment in text]
    # Shorter fragments are substituted before longer ones
    present.sort(key=len)

    resolved = text
    for fragment in present:
        resolved = resolved.replace(fragment, CHARACTER_CONSTANTS[fragment])
    return resolved


def perform_asr_correction(input_text, regex_pattern, language='en'):
    r"""
    Main function for text normalization for ASR retrieved input.
    Performs resolution for numerics and characters
    and uses fuzzy matching to modify text as per the RegEx provided.

    Example procedure:
        input_text = "बी nine nine three zero"
        regex = r"\w\d{4}"

        >> resolve_numerals(input_text)
        "बी 9 9 3 0"
        >> resolve_characters(processed_text)
        "B 9 9 3 0"
        >> fit_text_to_format(processed_text, regex_pattern)
        "B9930"

    Args:
        input_text (str): original text (as per ASR engine output)
        regex_pattern (str): Regex pattern to match
        language (str): Source language
    Returns:
        processed_text (str): modified text
    """
    processed_text = resolve_numerals(input_text, language)
    processed_text = resolve_characters(processed_text)
    processed_text = fit_text_to_format(processed_text, regex_pattern)
    # Lazy %-style arguments: the message is only rendered if the record is actually emitted
    ner_logger.info('ASR Processing converted %s --> %s', input_text, processed_text)
    return processed_text


def preprocess_asr_email(text):
    """
    Handles common error occurrences in Email ASR

    First every match of EMAIL_CORRECTION_RE is replaced by AT_SYMBOL, then every
    remaining ' at ' (with its surrounding spaces) is replaced by AT_SYMBOL as well.

    Args:
        text (str): original text (as per ASR engine output)
    Returns:
        processed_text (str): modified text
    """
    return re.sub(' at ', AT_SYMBOL, re.sub(EMAIL_CORRECTION_RE, AT_SYMBOL, text))
1 change: 1 addition & 0 deletions ner_constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@
PARAMETER_BOT_MESSAGE = 'bot_message'
PARAMETER_TIMEZONE = 'timezone'
PARAMETER_REGEX = 'regex'
PARAMETER_ASR = 'is_asr'
PARAMETER_PAST_DATE_REFERENCED = 'past_date_referenced'
PARAMETER_RANGE_ENABLED = 'range_enabled'

Expand Down
21 changes: 13 additions & 8 deletions ner_v1/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,18 +2,18 @@

import ast
import json

import six
from django.http import HttpResponse
from django.views.decorators.csrf import csrf_exempt
from django.views.decorators.http import require_http_methods
from elasticsearch import exceptions as es_exceptions

from chatbot_ner.config import ner_logger
from datastore.exceptions import DataStoreRequestException
from language_utilities.constant import ENGLISH_LANG
from ner_constants import (PARAMETER_MESSAGE, PARAMETER_ENTITY_NAME, PARAMETER_STRUCTURED_VALUE,
from ner_constants import (PARAMETER_MESSAGE, PARAMETER_ENTITY_NAME, PARAMETER_STRUCTURED_VALUE, PARAMETER_ASR,
PARAMETER_FALLBACK_VALUE, PARAMETER_BOT_MESSAGE, PARAMETER_TIMEZONE, PARAMETER_REGEX,
PARAMETER_LANGUAGE_SCRIPT, PARAMETER_SOURCE_LANGUAGE, PARAMETER_PRIOR_RESULTS)

from ner_v1.chatbot.combine_detection_logic import combine_output_of_detection_logic_and_tag
from ner_v1.chatbot.entity_detection import (get_location, get_phone_number, get_email, get_city, get_pnr,
get_number, get_passenger_count, get_shopping_size, get_time,
Expand All @@ -22,8 +22,6 @@
from ner_v1.chatbot.tag_message import run_ner
from ner_v1.constant import (PARAMETER_MIN_TOKEN_LEN_FUZZINESS, PARAMETER_FUZZINESS, PARAMETER_MIN_DIGITS,
PARAMETER_MAX_DIGITS)
from django.views.decorators.csrf import csrf_exempt
from django.views.decorators.http import require_http_methods


def to_bool(value):
Expand Down Expand Up @@ -62,6 +60,7 @@ def get_parameters_dictionary(request):
PARAMETER_BOT_MESSAGE: request.GET.get('bot_message'),
PARAMETER_TIMEZONE: request.GET.get('timezone'),
PARAMETER_REGEX: request.GET.get('regex'),
PARAMETER_ASR: request.GET.get('is_asr', False),
PARAMETER_LANGUAGE_SCRIPT: request.GET.get('language_script', ENGLISH_LANG),
PARAMETER_SOURCE_LANGUAGE: request.GET.get('source_language', ENGLISH_LANG),
PARAMETER_FUZZINESS: request.GET.get('fuzziness'),
Expand Down Expand Up @@ -94,6 +93,7 @@ def parse_post_request(request):
PARAMETER_BOT_MESSAGE: request_data.get('bot_message'),
PARAMETER_TIMEZONE: request_data.get('timezone'),
PARAMETER_REGEX: request_data.get('regex'),
PARAMETER_ASR: request_data.get('is_asr'),
naseem-shaik marked this conversation as resolved.
Show resolved Hide resolved
PARAMETER_LANGUAGE_SCRIPT: request_data.get('language_script', ENGLISH_LANG),
PARAMETER_SOURCE_LANGUAGE: request_data.get('source_language', ENGLISH_LANG),
PARAMETER_FUZZINESS: request_data.get('fuzziness'),
Expand Down Expand Up @@ -341,11 +341,15 @@ def regex(request):
"""
try:
parameters_dict = parse_parameters_from_request(request)
entity_output = get_regex(parameters_dict[PARAMETER_MESSAGE], parameters_dict[PARAMETER_ENTITY_NAME],
entity_output = get_regex(parameters_dict[PARAMETER_MESSAGE],
parameters_dict[PARAMETER_ENTITY_NAME],
parameters_dict[PARAMETER_STRUCTURED_VALUE],
parameters_dict[PARAMETER_FALLBACK_VALUE],
parameters_dict[PARAMETER_BOT_MESSAGE],
parameters_dict[PARAMETER_REGEX])
parameters_dict[PARAMETER_REGEX],
parameters_dict[PARAMETER_ASR],
parameters_dict[PARAMETER_SOURCE_LANGUAGE]
)
ner_logger.debug('Finished %s : %s ' % (parameters_dict[PARAMETER_ENTITY_NAME], entity_output))
except TypeError as e:
ner_logger.exception('Exception for regex: %s ' % e)
Expand All @@ -368,7 +372,8 @@ def email(request):
entity_output = get_email(parameters_dict[PARAMETER_MESSAGE], parameters_dict[PARAMETER_ENTITY_NAME],
parameters_dict[PARAMETER_STRUCTURED_VALUE],
parameters_dict[PARAMETER_FALLBACK_VALUE],
parameters_dict[PARAMETER_BOT_MESSAGE])
parameters_dict[PARAMETER_BOT_MESSAGE],
parameters_dict[PARAMETER_ASR])
ner_logger.debug('Finished %s : %s ' % (parameters_dict[PARAMETER_ENTITY_NAME], entity_output))
except TypeError as e:
ner_logger.exception('Exception for email: %s ' % e)
Expand Down
Loading