Develop to Master 21-04-22 #478

Merged · 17 commits · Apr 21, 2022

Changes from 1 commit
Regex based modifications
tanaya-b committed Mar 23, 2022
commit f008b4251563da781073bdd1cbf43af5edeb4527
49 changes: 0 additions & 49 deletions lib/nlp/levenshtein_distance.py

This file was deleted.

145 changes: 145 additions & 0 deletions lib/nlp/text_normalization.py
@@ -0,0 +1,145 @@
import regex as re
import string
from six.moves import range

from chatbot_ner.config import ner_logger
from ner_v1.detectors.pattern.regex.data.character_constants import CHARACTER_CONSTANTS
from ner_v2.detectors.numeral.constant import NUMBER_DETECTION_RETURN_DICT_VALUE
from ner_v2.detectors.numeral.number.number_detection import NumberDetector

PUNCTUATION_CHARACTERS = list(string.punctuation + '। ')


def edit_distance(string1, string2, insertion_cost=1, deletion_cost=1, substitution_cost=2, max_distance=None):
"""
    Calculate the weighted Levenshtein distance between two strings

    Args:
        string1 (str): unicode string. If a bytes object is passed, it will be decoded using utf-8
        string2 (str): unicode string. If a bytes object is passed, it will be decoded using utf-8
        insertion_cost (int, optional): cost penalty for an insertion operation, defaults to 1
        deletion_cost (int, optional): cost penalty for a deletion operation, defaults to 1
        substitution_cost (int, optional): cost penalty for a substitution operation, defaults to 2
        max_distance (int, optional): stop computing the edit distance once it grows larger than this value.
            If None, the complete edit distance is returned. Defaults to None

For Example:
edit_distance('hello', 'helllo', max_distance=3)
>> 1

edit_distance('beautiful', 'beauty', max_distance=3)
>> 3

    NOTE: Since computing the full edit distance can be expensive, the max_distance argument lets the
    computation stop early. Whenever the running distance exceeds max_distance, the function returns
    max_distance; otherwise it returns the full Levenshtein distance
"""
if isinstance(string1, bytes):
string1 = string1.decode('utf-8')

if isinstance(string2, bytes):
string2 = string2.decode('utf-8')

if len(string1) > len(string2):
string1, string2 = string2, string1
distances = list(range(len(string1) + 1))
for index2, char2 in enumerate(string2):
new_distances = [index2 + 1]
for index1, char1 in enumerate(string1):
if char1 == char2:
new_distances.append(distances[index1])
else:
new_distances.append(min((distances[index1] + substitution_cost,
distances[index1 + 1] + insertion_cost,
new_distances[-1] + deletion_cost)))
distances = new_distances
if max_distance and min(new_distances) > max_distance:
return max_distance

return distances[-1]
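
A quick sanity check of the weighted costs and the early exit (illustrative calls, not part of the diff; the outputs follow from the algorithm above):

from lib.nlp.text_normalization import edit_distance

edit_distance('kitten', 'mitten')                   # -> 2: one substitution, costed at 2 by default
edit_distance('abcdef', 'uvwxyz', max_distance=3)   # -> 3: every cell of the current row exceeds 3, so it
                                                    #    stops early (the true distance is 12)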


def fit_text_to_format(input_text, regex_pattern, insert_edits=None):
"""
Used to modify text to match the given regex pattern.
Args:
input_text (str): processed string with numerals and character constants fixed
regex_pattern (str): pattern to match
        insert_edits (int, optional): maximum number of insertion edits allowed for fuzzy matching.
            If not provided, it is derived from the number of punctuation/space characters in the text

Returns:
input_text (str): modified text
"""
    if not insert_edits:
        # Default budget: allow one insertion per punctuation/space character, plus a small buffer
        insert_edits = sum(1 for char in input_text if char in PUNCTUATION_CHARACTERS) + 2
    pattern = re.compile(f'(?b)({regex_pattern}){{i<={insert_edits}}}')
matched_format = pattern.search(input_text)
if matched_format:
        if any(matched_format.fuzzy_counts):
            # Insertion edits are reported at position 1 of the fuzzy_changes tuple
            fuzzy_edits = matched_format.fuzzy_changes[1]
            # Delete from the highest index first so earlier deletions do not shift later positions
            for index in sorted(fuzzy_edits, reverse=True):
                input_text = _omit_character_by_index(input_text, index)
return input_text


def _omit_character_by_index(text, index) -> str:
return text[:index] + text[index + 1:]
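
For reference, a minimal sketch of the fuzzy-matching behaviour relied on above, assuming the third-party regex module: {i<=n} tolerates up to n extra characters in the text relative to the pattern, and fuzzy_changes[1] reports where those extra characters sit.

import regex as re

m = re.search(r'(?b)(\d{4}){i<=2}', '12 34')   # the space is the only "inserted" character
print(m.fuzzy_counts)    # expected (0, 1, 0): substitutions, insertions, deletions
print(m.fuzzy_changes)   # expected ([], [2], []): the extra character is at index 2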


def resolve_numerals(text) -> str:
"""
Uses NumberDetector to resolve numeric occurrences in text for both English and Hindi.
Args:
        text (str): input text in which spelled-out numerals should be resolved
Returns:
processed_text (str): modified text
"""
processed_text = text
number_detector = NumberDetector('asr_dummy', language='en')
detected_numerals, original_texts = number_detector.detect_entity(text=text)
detected_numerals_hi, original_texts_hi = number_detector.detect_entity(text=text, language='hi')
    # list.extend() returns None, so combine the lists before zipping over them
    detected_numerals.extend(detected_numerals_hi)
    original_texts.extend(original_texts_hi)
    for number, original_text in zip(detected_numerals, original_texts):
substitution_reg = re.compile(re.escape(original_text), re.IGNORECASE)
processed_text = substitution_reg.sub(number[NUMBER_DETECTION_RETURN_DICT_VALUE], processed_text)
return processed_text


def resolve_characters(text) -> str:
"""
    Uses the CHARACTER_CONSTANTS dictionary to resolve Hindi (Devanagari) letter names in text to English letters.
Args:
text (str): processed string with numerals fixed
Returns:
processed_text (str): modified text
"""
processed_text = text
occurrences = []
for char in CHARACTER_CONSTANTS.keys():
if char in text:
occurrences.append(char)
    # Replace the longest fragments first so shorter keys do not break longer ones (e.g. "एम" inside "एम्")
    for fragment in sorted(occurrences, key=len, reverse=True):
        processed_text = processed_text.replace(fragment, CHARACTER_CONSTANTS[fragment])
return processed_text
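
A small illustration of why the longest fragments go first (assuming the mapping shown further down in this diff): "एम्" contains "एम", so replacing the shorter key first would orphan the halant and leave the longer key unmatched.

from lib.nlp.text_normalization import resolve_characters

print(resolve_characters('एम् जी'))   # expected 'M G' with longest-first replacement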


def perform_asr_correction(input_text, regex_pattern):
"""
    Main entry point for normalizing ASR-derived input text.
    Resolves spelled-out numerals and Devanagari letter names, then uses fuzzy matching
    to trim the text so it fits the provided regex pattern.

Args:
input_text (str): original text (as per ASR engine output)
regex_pattern (str): Regex pattern to match
Returns:
processed_text (str): modified text
"""
processed_text = resolve_numerals(input_text)
processed_text = resolve_characters(processed_text)
processed_text = fit_text_to_format(processed_text, regex_pattern)
return processed_text
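
An end-to-end sketch of the pipeline above (hypothetical ASR transcript and pattern; the exact output depends on what NumberDetector resolves):

from lib.nlp.text_normalization import perform_asr_correction

transcript = 'पी एन आर is one two three four five six'   # hypothetical ASR output
corrected = perform_asr_correction(transcript, regex_pattern=r'[A-Z]{3}\d{6}')
# Roughly: letter names and spelled-out digits are resolved first, then the fuzzy fit
# against the pattern drops stray spaces/punctuation so the regex can match.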
1 change: 1 addition & 0 deletions ner_constants.py
@@ -48,6 +48,7 @@
PARAMETER_BOT_MESSAGE = 'bot_message'
PARAMETER_TIMEZONE = 'timezone'
PARAMETER_REGEX = 'regex'
PARAMETER_ASR = 'is_asr'
PARAMETER_PAST_DATE_REFERENCED = 'past_date_referenced'
PARAMETER_RANGE_ENABLED = 'range_enabled'

11 changes: 6 additions & 5 deletions ner_v1/api.py
@@ -2,9 +2,10 @@

import ast
import json

import six
from django.http import HttpResponse
from django.views.decorators.csrf import csrf_exempt
from django.views.decorators.http import require_http_methods
from elasticsearch import exceptions as es_exceptions

from chatbot_ner.config import ner_logger
@@ -13,7 +14,6 @@
from ner_constants import (PARAMETER_MESSAGE, PARAMETER_ENTITY_NAME, PARAMETER_STRUCTURED_VALUE,
PARAMETER_FALLBACK_VALUE, PARAMETER_BOT_MESSAGE, PARAMETER_TIMEZONE, PARAMETER_REGEX,
PARAMETER_LANGUAGE_SCRIPT, PARAMETER_SOURCE_LANGUAGE, PARAMETER_PRIOR_RESULTS)

from ner_v1.chatbot.combine_detection_logic import combine_output_of_detection_logic_and_tag
from ner_v1.chatbot.entity_detection import (get_location, get_phone_number, get_email, get_city, get_pnr,
get_number, get_passenger_count, get_shopping_size, get_time,
@@ -22,8 +22,6 @@
from ner_v1.chatbot.tag_message import run_ner
from ner_v1.constant import (PARAMETER_MIN_TOKEN_LEN_FUZZINESS, PARAMETER_FUZZINESS, PARAMETER_MIN_DIGITS,
PARAMETER_MAX_DIGITS)
from django.views.decorators.csrf import csrf_exempt
from django.views.decorators.http import require_http_methods


def to_bool(value):
@@ -62,6 +60,7 @@ def get_parameters_dictionary(request):
PARAMETER_BOT_MESSAGE: request.GET.get('bot_message'),
PARAMETER_TIMEZONE: request.GET.get('timezone'),
PARAMETER_REGEX: request.GET.get('regex'),
PARAMETER_ASR: request.GET.get('is_asr', False),
PARAMETER_LANGUAGE_SCRIPT: request.GET.get('language_script', ENGLISH_LANG),
PARAMETER_SOURCE_LANGUAGE: request.GET.get('source_language', ENGLISH_LANG),
PARAMETER_FUZZINESS: request.GET.get('fuzziness'),
@@ -94,6 +93,7 @@ def parse_post_request(request):
PARAMETER_BOT_MESSAGE: request_data.get('bot_message'),
PARAMETER_TIMEZONE: request_data.get('timezone'),
PARAMETER_REGEX: request_data.get('regex'),
PARAMETER_ASR: request_data.get('is_asr'),
PARAMETER_LANGUAGE_SCRIPT: request_data.get('language_script', ENGLISH_LANG),
PARAMETER_SOURCE_LANGUAGE: request_data.get('source_language', ENGLISH_LANG),
PARAMETER_FUZZINESS: request_data.get('fuzziness'),
@@ -345,7 +345,8 @@ def regex(request):
parameters_dict[PARAMETER_STRUCTURED_VALUE],
parameters_dict[PARAMETER_FALLBACK_VALUE],
parameters_dict[PARAMETER_BOT_MESSAGE],
parameters_dict[PARAMETER_REGEX])
parameters_dict[PARAMETER_REGEX],
parameters_dict[PARAMETER_ASR])
ner_logger.debug('Finished %s : %s ' % (parameters_dict[PARAMETER_ENTITY_NAME], entity_output))
except TypeError as e:
ner_logger.exception('Exception for regex: %s ' % e)
4 changes: 2 additions & 2 deletions ner_v1/chatbot/entity_detection.py
@@ -621,7 +621,7 @@ def get_pnr(message, entity_name, structured_value, fallback_value, bot_message)
bot_message=bot_message)


def get_regex(message, entity_name, structured_value, fallback_value, bot_message, pattern):
def get_regex(message, entity_name, structured_value, fallback_value, bot_message, pattern, is_asr=False):
"""Use RegexDetector to detect text that abide by the specified
pattern.
The meta_data consists the pattern
@@ -658,7 +658,7 @@ def get_regex(message, entity_name, structured_value, fallback_value, bot_messag
>> [{'detection': 'message', 'original_text': '123', 'entity_value': {'value': '123'}}]

"""
regex_detector = RegexDetector(entity_name=entity_name, pattern=pattern)
regex_detector = RegexDetector(entity_name=entity_name, pattern=pattern, enable_asr=is_asr)
if structured_value:
entity_list, original_text_list = regex_detector.detect_entity(text=structured_value)
if entity_list:
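
A hedged usage sketch of the new is_asr flag (hypothetical entity name and message; signature as shown above):

output = get_regex(message='my otp is one two three four',
                   entity_name='otp',
                   structured_value=None,
                   fallback_value=None,
                   bot_message=None,
                   pattern=r'\d{4}',
                   is_asr=True)
# With is_asr=True, RegexDetector runs perform_asr_correction on the text before matching.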
35 changes: 35 additions & 0 deletions ner_v1/detectors/pattern/regex/data/character_constants.py
@@ -0,0 +1,35 @@
CHARACTER_CONSTANTS = {
"ए": "A",
"ब": "B",
"सी": "C",
"डी": "D",
"इ": "E",
"ई": "E",
"एफ": "F",
"ऍफ़": "F",
"जी": "G",
"एच": "H",
"आय": "I",
"जे": "J",
"के": "K",
"एल": "L",
"एम्": "M",
"एम": "M",
"एन": "N",
"ओ": "O",
"पी": "P",
"क्यू": "Q",
"आर": "R",
"एस": "S",
"टी": "T",
"यु": "U",
"वि": "V",
"वी": "V",
"डब्ल्यू": "W",
"डब्ल्यु": "W",
"डबल्यू": "W",
"एक्स": "X",
"वाय": "Y",
"ज़ेड": "Z",
"ज़ी": "Z",
}
10 changes: 8 additions & 2 deletions ner_v1/detectors/pattern/regex/regex_detection.py
@@ -10,6 +10,7 @@
from typing import List

from chatbot_ner.config import ner_logger
from lib.nlp.text_normalization import perform_asr_correction

try:
import regex as re
@@ -41,7 +42,7 @@ class RegexDetector(object):
pattern (raw str or str or unicode): pattern to be compiled into a re object
"""

def __init__(self, entity_name, pattern, re_flags=DEFAULT_FLAGS, max_matches=50):
def __init__(self, entity_name, pattern, enable_asr=False, re_flags=DEFAULT_FLAGS, max_matches=50):
"""
Args:
entity_name (str): an indicator value as tag to replace detected values
@@ -57,6 +58,8 @@ def __init__(self, entity_name, pattern, re_flags=DEFAULT_FLAGS, max_matches=50)
self.text = ''
self.tagged_text = ''
self.processed_text = ''
self.enable_asr = enable_asr
self.uncompiled_pattern = pattern
try:
self.pattern = re.compile(pattern, flags=re_flags)
except re.error:
@@ -95,7 +98,10 @@ def detect_entity(self, text):

"""
self.text = text
self.processed_text = self.text
if self.enable_asr:
self.processed_text = perform_asr_correction(self.text, self.uncompiled_pattern)
else:
self.processed_text = self.text
self.tagged_text = self.text
match_list, original_list = self._detect_regex()
self._update_processed_text(match_list)
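
A short sketch of the detector-level switch (hypothetical values): with enable_asr=True, detect_entity() normalizes the text via perform_asr_correction before applying the pattern.

from ner_v1.detectors.pattern.regex.regex_detection import RegexDetector

detector = RegexDetector(entity_name='otp', pattern=r'\d{4}', enable_asr=True)
entity_list, original_text_list = detector.detect_entity(text='my otp is one two three four')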
10 changes: 5 additions & 5 deletions ner_v1/detectors/textual/text/text_detection.py
@@ -1,18 +1,18 @@
from __future__ import absolute_import
import collections
import string

import collections
import six
import string
from six import iteritems
from six.moves import range

import language_utilities.constant as lang_constant
from chatbot_ner.config import ner_logger
from datastore import DataStore
from lib.nlp.const import TOKENIZER, whitespace_tokenizer
from lib.nlp.levenshtein_distance import edit_distance
from ner_v1.detectors.base_detector import BaseDetector
from lib.nlp.text_normalization import edit_distance
from ner_constants import ENTITY_VALUE_DICT_KEY
from six.moves import range
from ner_v1.detectors.base_detector import BaseDetector

try:
import regex as re