Develop to Master 21-04-22 #478

Merged · 17 commits · Apr 21, 2022

Changes from 1 commit
Regex based modifications
tanaya-b committed Mar 23, 2022
commit f008b4251563da781073bdd1cbf43af5edeb4527
49 changes: 0 additions & 49 deletions lib/nlp/levenshtein_distance.py

This file was deleted.

145 changes: 145 additions & 0 deletions lib/nlp/text_normalization.py
@@ -0,0 +1,145 @@
import regex as re
import string
from six.moves import range

from chatbot_ner.config import ner_logger
from ner_v1.detectors.pattern.regex.data.character_constants import CHARACTER_CONSTANTS
from ner_v2.detectors.numeral.constant import NUMBER_DETECTION_RETURN_DICT_VALUE
from ner_v2.detectors.numeral.number.number_detection import NumberDetector

PUNCTUATION_CHARACTERS = list(string.punctuation + '। ')


def edit_distance(string1, string2, insertion_cost=1, deletion_cost=1, substitution_cost=2, max_distance=None):
"""
    Calculate the weighted Levenshtein distance between two strings

    Args:
        string1 (str): unicode string. If a bytes object is passed, it will be decoded using utf-8
        string2 (str): unicode string. If a bytes object is passed, it will be decoded using utf-8
        insertion_cost (int, optional): cost penalty for an insertion operation, defaults to 1
        deletion_cost (int, optional): cost penalty for a deletion operation, defaults to 1
        substitution_cost (int, optional): cost penalty for a substitution operation, defaults to 2
        max_distance (int, optional): stop computing the edit distance once it grows larger than this value.
            If None, the complete edit distance is returned. Defaults to None

For Example:
edit_distance('hello', 'helllo', max_distance=3)
>> 1

edit_distance('beautiful', 'beauty', max_distance=3)
>> 3

    NOTE: Since computing the full edit distance can be expensive, the max_distance argument lets the
    computation stop early. Whenever the running distance exceeds max_distance, the function returns
    max_distance; otherwise it returns the full Levenshtein distance
"""
if isinstance(string1, bytes):
string1 = string1.decode('utf-8')

if isinstance(string2, bytes):
string2 = string2.decode('utf-8')

if len(string1) > len(string2):
string1, string2 = string2, string1
distances = list(range(len(string1) + 1))
for index2, char2 in enumerate(string2):
new_distances = [index2 + 1]
for index1, char1 in enumerate(string1):
if char1 == char2:
new_distances.append(distances[index1])
else:
new_distances.append(min((distances[index1] + substitution_cost,
distances[index1 + 1] + insertion_cost,
new_distances[-1] + deletion_cost)))
distances = new_distances
if max_distance and min(new_distances) > max_distance:
return max_distance

return distances[-1]
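
A quick sanity check of the weighted costs and the early exit (illustrative calls, not part of the diff; the outputs follow from the algorithm above):

from lib.nlp.text_normalization import edit_distance

edit_distance('kitten', 'mitten')                   # -> 2: one substitution, costed at 2 by default
edit_distance('abcdef', 'uvwxyz', max_distance=3)   # -> 3: every cell of the current row exceeds 3, so it
                                                    #    stops early (the true distance is 12)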


def fit_text_to_format(input_text, regex_pattern, insert_edits=None):
"""
Used to modify text to match the given regex pattern.
Args:
input_text (str): processed string with numerals and character constants fixed
regex_pattern (str): pattern to match
        insert_edits (int, optional): maximum number of insertion edits allowed for fuzzy matching.
            If not provided, it is derived from the number of punctuation/space characters in the text

Returns:
input_text (str): modified text
"""
    if not insert_edits:
        # Default budget: allow one insertion per punctuation/space character, plus a small buffer
        insert_edits = sum(1 for char in input_text if char in PUNCTUATION_CHARACTERS) + 2
    pattern = re.compile(f'(?b)({regex_pattern}){{i<={insert_edits}}}')
matched_format = pattern.search(input_text)
if matched_format:
        if any(matched_format.fuzzy_counts):
            # Insertion edits are reported at position 1 of the fuzzy_changes tuple
            fuzzy_edits = matched_format.fuzzy_changes[1]
            # Delete from the highest index first so earlier deletions do not shift later positions
            for index in sorted(fuzzy_edits, reverse=True):
                input_text = _omit_character_by_index(input_text, index)
return input_text


def _omit_character_by_index(text, index) -> str:
return text[:index] + text[index + 1:]
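
For reference, a minimal sketch of the fuzzy-matching behaviour relied on above, assuming the third-party regex module: {i<=n} tolerates up to n extra characters in the text relative to the pattern, and fuzzy_changes[1] reports where those extra characters sit.

import regex as re

m = re.search(r'(?b)(\d{4}){i<=2}', '12 34')   # the space is the only "inserted" character
print(m.fuzzy_counts)    # expected (0, 1, 0): substitutions, insertions, deletions
print(m.fuzzy_changes)   # expected ([], [2], []): the extra character is at index 2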


def resolve_numerals(text) -> str:
"""
Uses NumberDetector to resolve numeric occurrences in text for both English and Hindi.
Args:
        text (str): input text in which spelled-out numerals should be resolved
Returns:
processed_text (str): modified text
"""
processed_text = text
number_detector = NumberDetector('asr_dummy', language='en')
detected_numerals, original_texts = number_detector.detect_entity(text=text)
detected_numerals_hi, original_texts_hi = number_detector.detect_entity(text=text, language='hi')
    # list.extend() returns None, so combine the lists before zipping over them
    detected_numerals.extend(detected_numerals_hi)
    original_texts.extend(original_texts_hi)
    for number, original_text in zip(detected_numerals, original_texts):
substitution_reg = re.compile(re.escape(original_text), re.IGNORECASE)
processed_text = substitution_reg.sub(number[NUMBER_DETECTION_RETURN_DICT_VALUE], processed_text)
return processed_text


def resolve_characters(text) -> str:
"""
    Uses the CHARACTER_CONSTANTS dictionary to resolve Hindi (Devanagari) letter names in text to English letters.
Args:
text (str): processed string with numerals fixed
Returns:
processed_text (str): modified text
"""
processed_text = text
occurrences = []
for char in CHARACTER_CONSTANTS.keys():
if char in text:
occurrences.append(char)
    # Replace the longest fragments first so shorter keys do not break longer ones (e.g. "एम" inside "एम्")
    for fragment in sorted(occurrences, key=len, reverse=True):
        processed_text = processed_text.replace(fragment, CHARACTER_CONSTANTS[fragment])
return processed_text
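
A small illustration of why the longest fragments go first (assuming the mapping shown further down in this diff): "एम्" contains "एम", so replacing the shorter key first would orphan the halant and leave the longer key unmatched.

from lib.nlp.text_normalization import resolve_characters

print(resolve_characters('एम् जी'))   # expected 'M G' with longest-first replacement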


def perform_asr_correction(input_text, regex_pattern):
"""
    Main entry point for normalizing ASR-derived input text.
    Resolves spelled-out numerals and Devanagari letter names, then uses fuzzy matching
    to trim the text so it fits the provided regex pattern.

Args:
input_text (str): original text (as per ASR engine output)
regex_pattern (str): Regex pattern to match
Returns:
processed_text (str): modified text
"""
processed_text = resolve_numerals(input_text)
processed_text = resolve_characters(processed_text)
processed_text = fit_text_to_format(processed_text, regex_pattern)
return processed_text
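
An end-to-end sketch of the pipeline above (hypothetical ASR transcript and pattern; the exact output depends on what NumberDetector resolves):

from lib.nlp.text_normalization import perform_asr_correction

transcript = 'पी एन आर is one two three four five six'   # hypothetical ASR output
corrected = perform_asr_correction(transcript, regex_pattern=r'[A-Z]{3}\d{6}')
# Roughly: letter names and spelled-out digits are resolved first, then the fuzzy fit
# against the pattern drops stray spaces/punctuation so the regex can match.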
1 change: 1 addition & 0 deletions ner_constants.py
@@ -48,6 +48,7 @@
PARAMETER_BOT_MESSAGE = 'bot_message'
PARAMETER_TIMEZONE = 'timezone'
PARAMETER_REGEX = 'regex'
PARAMETER_ASR = 'is_asr'
PARAMETER_PAST_DATE_REFERENCED = 'past_date_referenced'
PARAMETER_RANGE_ENABLED = 'range_enabled'

11 changes: 6 additions & 5 deletions ner_v1/api.py
@@ -2,9 +2,10 @@

import ast
import json

import six
from django.http import HttpResponse
from django.views.decorators.csrf import csrf_exempt
from django.views.decorators.http import require_http_methods
from elasticsearch import exceptions as es_exceptions

from chatbot_ner.config import ner_logger
@@ -13,7 +14,6 @@
from ner_constants import (PARAMETER_MESSAGE, PARAMETER_ENTITY_NAME, PARAMETER_STRUCTURED_VALUE,
PARAMETER_FALLBACK_VALUE, PARAMETER_BOT_MESSAGE, PARAMETER_TIMEZONE, PARAMETER_REGEX,
PARAMETER_LANGUAGE_SCRIPT, PARAMETER_SOURCE_LANGUAGE, PARAMETER_PRIOR_RESULTS)

from ner_v1.chatbot.combine_detection_logic import combine_output_of_detection_logic_and_tag
from ner_v1.chatbot.entity_detection import (get_location, get_phone_number, get_email, get_city, get_pnr,
get_number, get_passenger_count, get_shopping_size, get_time,
@@ -22,8 +22,6 @@
from ner_v1.chatbot.tag_message import run_ner
from ner_v1.constant import (PARAMETER_MIN_TOKEN_LEN_FUZZINESS, PARAMETER_FUZZINESS, PARAMETER_MIN_DIGITS,
PARAMETER_MAX_DIGITS)
from django.views.decorators.csrf import csrf_exempt
from django.views.decorators.http import require_http_methods


def to_bool(value):
@@ -62,6 +60,7 @@ def get_parameters_dictionary(request):
PARAMETER_BOT_MESSAGE: request.GET.get('bot_message'),
PARAMETER_TIMEZONE: request.GET.get('timezone'),
PARAMETER_REGEX: request.GET.get('regex'),
PARAMETER_ASR: request.GET.get('is_asr', False),
PARAMETER_LANGUAGE_SCRIPT: request.GET.get('language_script', ENGLISH_LANG),
PARAMETER_SOURCE_LANGUAGE: request.GET.get('source_language', ENGLISH_LANG),
PARAMETER_FUZZINESS: request.GET.get('fuzziness'),
@@ -94,6 +93,7 @@ def parse_post_request(request):
PARAMETER_BOT_MESSAGE: request_data.get('bot_message'),
PARAMETER_TIMEZONE: request_data.get('timezone'),
PARAMETER_REGEX: request_data.get('regex'),
PARAMETER_ASR: request_data.get('is_asr'),
PARAMETER_LANGUAGE_SCRIPT: request_data.get('language_script', ENGLISH_LANG),
PARAMETER_SOURCE_LANGUAGE: request_data.get('source_language', ENGLISH_LANG),
PARAMETER_FUZZINESS: request_data.get('fuzziness'),
@@ -345,7 +345,8 @@ def regex(request):
parameters_dict[PARAMETER_STRUCTURED_VALUE],
parameters_dict[PARAMETER_FALLBACK_VALUE],
parameters_dict[PARAMETER_BOT_MESSAGE],
parameters_dict[PARAMETER_REGEX])
parameters_dict[PARAMETER_REGEX],
parameters_dict[PARAMETER_ASR])
ner_logger.debug('Finished %s : %s ' % (parameters_dict[PARAMETER_ENTITY_NAME], entity_output))
except TypeError as e:
ner_logger.exception('Exception for regex: %s ' % e)
4 changes: 2 additions & 2 deletions ner_v1/chatbot/entity_detection.py
@@ -621,7 +621,7 @@ def get_pnr(message, entity_name, structured_value, fallback_value, bot_message)
bot_message=bot_message)


def get_regex(message, entity_name, structured_value, fallback_value, bot_message, pattern):
def get_regex(message, entity_name, structured_value, fallback_value, bot_message, pattern, is_asr=False):
"""Use RegexDetector to detect text that abide by the specified
pattern.
The meta_data consists the pattern
@@ -658,7 +658,7 @@ def get_regex(message, entity_name, structured_value, fallback_value, bot_messag
>> [{'detection': 'message', 'original_text': '123', 'entity_value': {'value': '123'}}]

"""
regex_detector = RegexDetector(entity_name=entity_name, pattern=pattern)
regex_detector = RegexDetector(entity_name=entity_name, pattern=pattern, enable_asr=is_asr)
if structured_value:
entity_list, original_text_list = regex_detector.detect_entity(text=structured_value)
if entity_list:
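
A hedged usage sketch of the new is_asr flag (hypothetical entity name and message; signature as shown above):

output = get_regex(message='my otp is one two three four',
                   entity_name='otp',
                   structured_value=None,
                   fallback_value=None,
                   bot_message=None,
                   pattern=r'\d{4}',
                   is_asr=True)
# With is_asr=True, RegexDetector runs perform_asr_correction on the text before matching.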
35 changes: 35 additions & 0 deletions ner_v1/detectors/pattern/regex/data/character_constants.py
@@ -0,0 +1,35 @@
CHARACTER_CONSTANTS = {
"ए": "A",
"ब": "B",
"सी": "C",
"डी": "D",
"इ": "E",
"ई": "E",
"एफ": "F",
"ऍफ़": "F",
"जी": "G",
"एच": "H",
"आय": "I",
"जे": "J",
"के": "K",
"एल": "L",
"एम्": "M",
"एम": "M",
"एन": "N",
"ओ": "O",
"पी": "P",
"क्यू": "Q",
"आर": "R",
"एस": "S",
"टी": "T",
"यु": "U",
"वि": "V",
"वी": "V",
"डब्ल्यू": "W",
"डब्ल्यु": "W",
"डबल्यू": "W",
"एक्स": "X",
"वाय": "Y",
"ज़ेड": "Z",
"ज़ी": "Z",
}
10 changes: 8 additions & 2 deletions ner_v1/detectors/pattern/regex/regex_detection.py
@@ -10,6 +10,7 @@
from typing import List

from chatbot_ner.config import ner_logger
from lib.nlp.text_normalization import perform_asr_correction

try:
import regex as re
@@ -41,7 +42,7 @@ class RegexDetector(object):
pattern (raw str or str or unicode): pattern to be compiled into a re object
"""

def __init__(self, entity_name, pattern, re_flags=DEFAULT_FLAGS, max_matches=50):
def __init__(self, entity_name, pattern, enable_asr=False, re_flags=DEFAULT_FLAGS, max_matches=50):
"""
Args:
entity_name (str): an indicator value as tag to replace detected values
@@ -57,6 +58,8 @@ def __init__(self, entity_name, pattern, re_flags=DEFAULT_FLAGS, max_matches=50)
self.text = ''
self.tagged_text = ''
self.processed_text = ''
self.enable_asr = enable_asr
self.uncompiled_pattern = pattern
try:
self.pattern = re.compile(pattern, flags=re_flags)
except re.error:
@@ -95,7 +98,10 @@ def detect_entity(self, text):

"""
self.text = text
self.processed_text = self.text
if self.enable_asr:
self.processed_text = perform_asr_correction(self.text, self.uncompiled_pattern)
else:
self.processed_text = self.text
self.tagged_text = self.text
match_list, original_list = self._detect_regex()
self._update_processed_text(match_list)
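
A short sketch of the detector-level switch (hypothetical values): with enable_asr=True, detect_entity() normalizes the text via perform_asr_correction before applying the pattern.

from ner_v1.detectors.pattern.regex.regex_detection import RegexDetector

detector = RegexDetector(entity_name='otp', pattern=r'\d{4}', enable_asr=True)
entity_list, original_text_list = detector.detect_entity(text='my otp is one two three four')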
10 changes: 5 additions & 5 deletions ner_v1/detectors/textual/text/text_detection.py
@@ -1,18 +1,18 @@
from __future__ import absolute_import
import collections
import string

import collections
import six
import string
from six import iteritems
from six.moves import range

import language_utilities.constant as lang_constant
from chatbot_ner.config import ner_logger
from datastore import DataStore
from lib.nlp.const import TOKENIZER, whitespace_tokenizer
from lib.nlp.levenshtein_distance import edit_distance
from ner_v1.detectors.base_detector import BaseDetector
from lib.nlp.text_normalization import edit_distance
from ner_constants import ENTITY_VALUE_DICT_KEY
from six.moves import range
from ner_v1.detectors.base_detector import BaseDetector

try:
import regex as re