hellohaptik · tanaya-b · Apr 20, 2022 · Mar 16, 2022 · Mar 21, 2022 · Mar 22, 2022
diff --git a/lib/nlp/levenshtein_distance.py b/lib/nlp/levenshtein_distance.py
diff --git a/lib/nlp/text_normalization.py b/lib/nlp/text_normalization.py
@@ -0,0 +1,208 @@
+import regex as re
+import string
+from six.moves import range
+
+from chatbot_ner.config import ner_logger
+from ner_v1.detectors.pattern.regex.data.character_constants import CHARACTER_CONSTANTS
+from ner_v2.detectors.numeral.constant import NUMBER_DETECTION_RETURN_DICT_VALUE
+from ner_v2.detectors.numeral.number.number_detection import NumberDetector
+
+# Constants
+# Flags for the third-party `regex` module (imported above as `re`): unicode matching + VERSION1 behaviour
+_re_flags = re.UNICODE | re.V1
+# ASCII punctuation plus the Devanagari danda ('।') and the space character
+PUNCTUATION_CHARACTERS = list(string.punctuation + '। ')
+# Finds `{min,max}` repetition ranges written inside a caller supplied pattern and captures both bounds.
+# Only the braces are escaped (by hand): running re.escape() over the whole expression would escape the
+# named groups as well, leaving a pattern that matches nothing but its own source text.
+CAPTURE_RANGE_RE = r"\{(?P<minimum>\d+),(?P<maximum>\d+)\}"
+# NOTE(review): `rate` is the only mandatory part and is not anchored to word boundaries, so this also
+# matches "rate" inside longer words (e.g. "corporate") - confirm whether that is intended
+EMAIL_CORRECTION_RE = '@? ?(at)? ?(the)? ?(rate)'
+AT_SYMBOL = '@'
+
+
+def edit_distance(string1, string2, insertion_cost=1, deletion_cost=1, substitution_cost=2, max_distance=None):
+    """
+    Calculate the weighted levenshtein distance between two strings, i.e. the minimum total cost of
+    insertions, deletions and substitutions needed to turn string1 into string2
+
+    Args:
+        string1 (unicode): unicode string. If any encoded string type 'str' is passed, it will be decoded using utf-8
+        string2 (unicode): unicode string. If any encoded string type 'str' is passed, it will be decoded using utf-8
+        insertion_cost (int, optional): cost penalty for insertion operation, defaults to 1
+        deletion_cost (int, optional): cost penalty for deletion operation, defaults to 1
+        substitution_cost (int, optional): cost penalty for substitution operation, defaults to 2
+        max_distance (int, optional): Stop computing edit distance if it grows larger than this argument.
+                                      If None (or 0) complete edit distance is returned. Defaults to None
+
+    Returns:
+        int: the edit distance, or max_distance if the computation was stopped early
+
+    For Example:
+        edit_distance('hello', 'helllo', max_distance=3)
+        >> 1
+
+        edit_distance('beautiful', 'beauty', max_distance=3)
+        >> 3
+
+    NOTE: Since, minimum edit distance is time consuming process, we have defined max_distance attribute.
+    So, whenever every entry of the row being computed exceeds max_distance the function will break and
+    return max_distance. If that never happens the complete levenshtein distance is returned, which can
+    itself be larger than max_distance.
+    """
+    if isinstance(string1, bytes):
+        string1 = string1.decode('utf-8')
+
+    if isinstance(string2, bytes):
+        string2 = string2.decode('utf-8')
+
+    if len(string1) > len(string2):
+        # Keep the row as short as possible. Swapping the strings reverses the direction of the
+        # transformation, which turns every insertion into a deletion and vice versa, so the two
+        # costs have to be swapped along with the strings.
+        string1, string2 = string2, string1
+        insertion_cost, deletion_cost = deletion_cost, insertion_cost
+
+    # First row: turning a prefix of string1 into the empty string takes one deletion per character
+    distances = [index1 * deletion_cost for index1 in range(len(string1) + 1)]
+    for index2, char2 in enumerate(string2):
+        # First column: building a prefix of string2 from the empty string takes one insertion per character
+        new_distances = [(index2 + 1) * insertion_cost]
+        for index1, char1 in enumerate(string1):
+            if char1 == char2:
+                new_distances.append(distances[index1])
+            else:
+                new_distances.append(min(distances[index1] + substitution_cost,
+                                         distances[index1 + 1] + insertion_cost,
+                                         new_distances[-1] + deletion_cost))
+        distances = new_distances
+        # Costs are only ever added, so the final distance cannot be smaller than the smallest entry of this row
+        if max_distance and min(new_distances) > max_distance:
+            return max_distance
+
+    return distances[-1]
+
+
+def fit_text_to_format(input_text, regex_pattern, insert_edits=None):
+    r"""
+    Used to modify text to match the given regex pattern.
+
+    The pattern is searched in the text with fuzzy matching that tolerates up to `insert_edits` extra
+    characters; the characters the match reports as extra are then removed from the text.
+
+    Args:
+        input_text (str): processed string with numerals and character constants fixed
+        regex_pattern (str): pattern to match
+        insert_edits (int, optional): maximum number of extra (inserted) characters allowed for fuzzy matching.
+            If None, (number of punctuation and space characters in input_text + 2) is used
+
+    Returns:
+        input_text (str): modified text, or the text exactly as passed if the pattern could not be matched
+
+    Example:
+        fit_text_to_format(input_text='1 2 3 45', regex_pattern='\d{5}')
+        >> "12345"
+    """
+
+    if insert_edits is None:
+        # A rough heuristic to allow (#_of_punctuations + 2) extra characters during fuzzy matching.
+        # Applied only when nothing was passed: an explicit 0 means "no extra characters allowed"
+        insert_edits = sum(1 for char in input_text if char in PUNCTUATION_CHARACTERS) + 2
+
+    def _fuzzy_search(candidate_pattern):
+        # (?b) requests the best match, {i<=N} permits at most N inserted characters.
+        # Every candidate is compiled with the same flags so all searches behave alike
+        fuzzy_pattern = re.compile(f'(?b)({candidate_pattern}){{i<={insert_edits}}}', flags=_re_flags)
+        return fuzzy_pattern.search(input_text)
+
+    matched_format = _fuzzy_search(regex_pattern)
+
+    # Fuzzy matching acts in a non-greedy fashion, hence the following resolution of reverse iterations
+    # Eg. For regex="\d{3,5}" text="12345", fuzzy match detects "123"
+    # Therefore we start checking from the maximum number
+    range_matches = re.finditer(CAPTURE_RANGE_RE, regex_pattern)
+    for match in range_matches:
+        min_range = int(match["minimum"])
+        max_range = int(match["maximum"])
+        for i in range(max_range, min_range - 1, -1):
+            # Pin the range to one exact repetition count and retry
+            temp_pattern = regex_pattern.replace(match.group(), f'{{{i}}}')
+            matched_format = _fuzzy_search(temp_pattern)
+            if matched_format:
+                regex_pattern = temp_pattern
+                break
+            if i == min_range:
+                # No repetition count in the range matched, leave the text untouched
+                return input_text
+
+    if matched_format and any(matched_format.fuzzy_counts):
+        # Insert edit positions are returned at position 1 in the fuzzy_changes tuple
+        extra_positions = sorted(matched_format.fuzzy_changes[1])
+        # Removing "additional characters" in text. Each removal shifts the remaining positions
+        # one step to the left, hence the correction by the number of characters already removed
+        for removed_so_far, index in enumerate(extra_positions):
+            input_text = _omit_character_by_index(input_text, index - removed_so_far)
+    return input_text
+
+
+def _omit_character_by_index(text, index) -> str:
+    """Return `text` with the character at position `index` left out."""
+    head, tail = text[:index], text[index + 1:]
+    return head + tail
+
+
+def resolve_numerals(text, language) -> str:
+    """
+    Replace the number mentions that NumberDetector finds in text with their detected values.
+
+    Detection is run twice on the same text, once with default arguments and once with language='hi'.
+    Every detected original text is then substituted, ignoring case, wherever it occurs.
+    Args:
+        text (str): text to resolve numerals in
+        language (str): Language for NumberDetector
+    Returns:
+        processed_text (str): modified text
+    """
+    detector = NumberDetector('asr_dummy', language=language)
+    # FIXME: Detection fails if text starts with '0' since number detector discards it
+    numerals, matched_texts = detector.detect_entity(text=text)
+    numerals_hi, matched_texts_hi = detector.detect_entity(text=text, language='hi')
+    numerals.extend(numerals_hi)
+    matched_texts.extend(matched_texts_hi)
+
+    resolved_text = text
+    for numeral, matched_text in zip(numerals, matched_texts):
+        replacement = numeral[NUMBER_DETECTION_RETURN_DICT_VALUE]
+        # The detected text is escaped so that it is matched literally
+        resolved_text = re.sub(re.escape(matched_text), replacement, resolved_text, flags=re.IGNORECASE)
+    return resolved_text
+
+
+def resolve_characters(text) -> str:
+    """
+    Replace every CHARACTER_CONSTANTS key that occurs in text with the value it maps to.
+
+    Keys are looked up in the text as it was passed in and are replaced shortest first.
+    Args:
+        text (str): processed string with numerals fixed
+    Returns:
+        processed_text (str): modified text
+    """
+    found_keys = [key for key in CHARACTER_CONSTANTS if key in text]
+    resolved_text = text
+    for key in sorted(found_keys, key=len):
+        resolved_text = resolved_text.replace(key, CHARACTER_CONSTANTS[key])
+    return resolved_text
+
+
+def perform_asr_correction(input_text, regex_pattern, language='en'):
+    r"""
+    Main function for text normalization for ASR retrieved input.
+    Performs resolution for numerics and characters
+    and uses fuzzy matching to modify text as per the RegEx provided.
+    The original and the converted text are written to ner_logger at info level.
+
+    Args:
+        input_text (str): original text (as per ASR engine output)
+        regex_pattern (str): Regex pattern to match
+        language (str): Source language, passed on to resolve_numerals. Defaults to 'en'
+    Returns:
+        processed_text (str): modified text
+
+    Example procedure:
+        input_text = "बी nine nine three zero"
+        regex = r"\w\d{4}"
+
+        >> resolve_numerals(input_text)
+            "बी 9 9 3 0"
+        >> resolve_characters(processed_text)
+            "B 9 9 3 0"
+        >> fit_text_to_format(processed_text, regex_pattern)
+            "B9930"
+        Returns:
+            'B9930'
+    """
+    # Order matters: numerals and characters are resolved before the text is fitted to the pattern
+    processed_text = resolve_numerals(input_text, language)
+    processed_text = resolve_characters(processed_text)
+    processed_text = fit_text_to_format(processed_text, regex_pattern)
+    ner_logger.info(f'ASR Processing converted {input_text} --> {processed_text}')
+    return processed_text
+
+
+def preprocess_asr_email(text):
+    """
+    Handles common error occurrences in Email ASR
+
+    Matches of EMAIL_CORRECTION_RE are replaced with AT_SYMBOL first, afterwards every remaining
+    ' at ' (with a space on both sides) is replaced with AT_SYMBOL as well.
+
+    Args:
+        text (str): original text (as per ASR engine output)
+    Returns:
+        processed_text (str): modified text
+    """
+    corrected_text = text
+    for spoken_at_pattern in (EMAIL_CORRECTION_RE, ' at '):
+        corrected_text = re.sub(spoken_at_pattern, AT_SYMBOL, corrected_text)
+    return corrected_text
diff --git a/ner_constants.py b/ner_constants.py
@@ -48,6 +48,7 @@
 PARAMETER_BOT_MESSAGE = 'bot_message'
 PARAMETER_TIMEZONE = 'timezone'
 PARAMETER_REGEX = 'regex'
+# Key of the 'is_asr' request parameter; ner_v1/api.py reads it and forwards it to get_regex and get_email.
+# Presumably marks the message as ASR (speech-to-text) output - confirm against the detectors
+PARAMETER_ASR = 'is_asr'
 PARAMETER_PAST_DATE_REFERENCED = 'past_date_referenced'
 PARAMETER_RANGE_ENABLED = 'range_enabled'
 

diff --git a/ner_v1/api.py b/ner_v1/api.py
@@ -2,18 +2,18 @@
 
 import ast
 import json
-
 import six
 from django.http import HttpResponse
+from django.views.decorators.csrf import csrf_exempt
+from django.views.decorators.http import require_http_methods
 from elasticsearch import exceptions as es_exceptions
 
 from chatbot_ner.config import ner_logger
 from datastore.exceptions import DataStoreRequestException
 from language_utilities.constant import ENGLISH_LANG
-from ner_constants import (PARAMETER_MESSAGE, PARAMETER_ENTITY_NAME, PARAMETER_STRUCTURED_VALUE,
+from ner_constants import (PARAMETER_MESSAGE, PARAMETER_ENTITY_NAME, PARAMETER_STRUCTURED_VALUE, PARAMETER_ASR,
                            PARAMETER_FALLBACK_VALUE, PARAMETER_BOT_MESSAGE, PARAMETER_TIMEZONE, PARAMETER_REGEX,
                            PARAMETER_LANGUAGE_SCRIPT, PARAMETER_SOURCE_LANGUAGE, PARAMETER_PRIOR_RESULTS)
-
 from ner_v1.chatbot.combine_detection_logic import combine_output_of_detection_logic_and_tag
 from ner_v1.chatbot.entity_detection import (get_location, get_phone_number, get_email, get_city, get_pnr,
                                              get_number, get_passenger_count, get_shopping_size, get_time,
@@ -22,8 +22,6 @@
 from ner_v1.chatbot.tag_message import run_ner
 from ner_v1.constant import (PARAMETER_MIN_TOKEN_LEN_FUZZINESS, PARAMETER_FUZZINESS, PARAMETER_MIN_DIGITS,
                              PARAMETER_MAX_DIGITS)
-from django.views.decorators.csrf import csrf_exempt
-from django.views.decorators.http import require_http_methods
 
 
 def to_bool(value):
@@ -62,6 +60,7 @@ def get_parameters_dictionary(request):
         PARAMETER_BOT_MESSAGE: request.GET.get('bot_message'),
         PARAMETER_TIMEZONE: request.GET.get('timezone'),
         PARAMETER_REGEX: request.GET.get('regex'),
+        PARAMETER_ASR: request.GET.get('is_asr', False),
         PARAMETER_LANGUAGE_SCRIPT: request.GET.get('language_script', ENGLISH_LANG),
         PARAMETER_SOURCE_LANGUAGE: request.GET.get('source_language', ENGLISH_LANG),
         PARAMETER_FUZZINESS: request.GET.get('fuzziness'),
@@ -94,6 +93,7 @@ def parse_post_request(request):
         PARAMETER_BOT_MESSAGE: request_data.get('bot_message'),
         PARAMETER_TIMEZONE: request_data.get('timezone'),
         PARAMETER_REGEX: request_data.get('regex'),
+        PARAMETER_ASR: request_data.get('is_asr'),
         PARAMETER_LANGUAGE_SCRIPT: request_data.get('language_script', ENGLISH_LANG),
         PARAMETER_SOURCE_LANGUAGE: request_data.get('source_language', ENGLISH_LANG),
         PARAMETER_FUZZINESS: request_data.get('fuzziness'),
@@ -341,11 +341,15 @@ def regex(request):
     """
     try:
         parameters_dict = parse_parameters_from_request(request)
-        entity_output = get_regex(parameters_dict[PARAMETER_MESSAGE], parameters_dict[PARAMETER_ENTITY_NAME],
+        entity_output = get_regex(parameters_dict[PARAMETER_MESSAGE],
+                                  parameters_dict[PARAMETER_ENTITY_NAME],
                                   parameters_dict[PARAMETER_STRUCTURED_VALUE],
                                   parameters_dict[PARAMETER_FALLBACK_VALUE],
                                   parameters_dict[PARAMETER_BOT_MESSAGE],
-                                  parameters_dict[PARAMETER_REGEX])
+                                  parameters_dict[PARAMETER_REGEX],
+                                  parameters_dict[PARAMETER_ASR],
+                                  parameters_dict[PARAMETER_SOURCE_LANGUAGE]
+                                  )
         ner_logger.debug('Finished %s : %s ' % (parameters_dict[PARAMETER_ENTITY_NAME], entity_output))
     except TypeError as e:
         ner_logger.exception('Exception for regex: %s ' % e)
@@ -368,7 +372,8 @@ def email(request):
         entity_output = get_email(parameters_dict[PARAMETER_MESSAGE], parameters_dict[PARAMETER_ENTITY_NAME],
                                   parameters_dict[PARAMETER_STRUCTURED_VALUE],
                                   parameters_dict[PARAMETER_FALLBACK_VALUE],
-                                  parameters_dict[PARAMETER_BOT_MESSAGE])
+                                  parameters_dict[PARAMETER_BOT_MESSAGE],
+                                  parameters_dict[PARAMETER_ASR])
         ner_logger.debug('Finished %s : %s ' % (parameters_dict[PARAMETER_ENTITY_NAME], entity_output))
     except TypeError as e:
         ner_logger.exception('Exception for email: %s ' % e)