diff --git a/.coveragerc b/.coveragerc new file mode 100644 index 000000000..fa58ffe57 --- /dev/null +++ b/.coveragerc @@ -0,0 +1,41 @@ +[run] +source = . +omit = + *.pyc + *.pyo + */site-packages/* + */distutils/* + docs/ + docker/ + logs/ + postman_tests/ + */tests/* + */test.py + */tests.py + manage.py + */settings.py + */urls.py + */migrations/* + *wsgi.py + *__init__.py + +[report] +skip_empty = True +sort = Cover +exclude_lines = + pragma: no cover + + # Don't complain about missing debug-only code: + def __repr__ + if self\.debug + + # Don't complain if tests don't hit defensive assertion code: + raise AssertionError + raise NotImplementedError + + # Don't complain if non-runnable code isn't run: + if 0: + if __name__ == .__main__.: + + __author__ = 'haptik' +show_missing = True \ No newline at end of file diff --git a/.gitignore b/.gitignore index 89c33629c..4d72385b5 100644 --- a/.gitignore +++ b/.gitignore @@ -103,7 +103,7 @@ ENV/ /newrelic.ini sftp-config.json .DS_Store -logs/ +logs/*.log* .vscode newman_reports/ diff --git a/chatbot_ner/config.py b/chatbot_ner/config.py index 0720b40c8..ffd1820fb 100644 --- a/chatbot_ner/config.py +++ b/chatbot_ner/config.py @@ -13,14 +13,10 @@ LOG_PATH = os.path.join(BASE_DIR, 'logs') # TODO: Set this up via Django LOGGING -# SET UP NER LOGGING -if not os.path.exists(LOG_PATH): - os.makedirs(LOG_PATH) - LOG_LEVEL = os.environ.get('DJANGO_LOG_LEVEL', 'error').upper() # Common formatter -formatter = logging.Formatter("%(asctime)s\t%(levelname)s\t%(message)s", "%Y-%m-%d %H:%M:%S") +formatter = logging.Formatter("%(asctime)s %(levelname)s %(message)s %(module)s:%(lineno)d") # Handler for Docker stdout handler_stdout = logging.StreamHandler() @@ -29,28 +25,14 @@ # SETUP NER LOGGING NER_LOG_FILENAME = os.path.join(LOG_PATH, 'ner_log.log') -# Set up a specific logger with our desired output level -ner_logger = logging.getLogger('NERLogger') -ner_logger.setLevel(LOG_LEVEL) -# Add the log message handler to the logger handler = logging.handlers.WatchedFileHandler(NER_LOG_FILENAME) -# handler = logging.handlers.RotatingFileHandler(NER_LOG_FILENAME, maxBytes=10 * 1024 * 1024, backupCount=5) handler.setFormatter(formatter) + +ner_logger = logging.getLogger('NERLogger') +ner_logger.setLevel(LOG_LEVEL) ner_logger.addHandler(handler) ner_logger.addHandler(handler_stdout) -# SETUP NLP LIB LOGGING -NLP_LIB_LOG_FILENAME = os.path.join(LOG_PATH, 'nlp_log.log') -# Set up a specific logger with our desired output level -nlp_logger = logging.getLogger('NLPLibLogger') -nlp_logger.setLevel(LOG_LEVEL) -# Add the log message handler to the logger -handler = logging.handlers.WatchedFileHandler(NLP_LIB_LOG_FILENAME) -# handler = logging.handlers.RotatingFileHandler(NLP_LIB_LOG_FILENAME, maxBytes=10 * 1024 * 1024, backupCount=5) -handler.setFormatter(formatter) -nlp_logger.addHandler(handler) -nlp_logger.addHandler(handler_stdout) - ENGINE = os.environ.get('ENGINE') # ES settings (Mandatory to use Text type entities) ES_SCHEME = os.environ.get('ES_SCHEME', 'http') diff --git a/chatbot_ner/settings.py b/chatbot_ner/settings.py index 84b9ff273..f8d8c77af 100755 --- a/chatbot_ner/settings.py +++ b/chatbot_ner/settings.py @@ -10,6 +10,7 @@ # Build paths inside the project like this: os.path.join(BASE_DIR, ...) from __future__ import absolute_import + import os import sys @@ -119,27 +120,25 @@ def __getitem__(self, item): 'CONN_MAX_AGE': 60 } -# MIGRATION_MODULES = DisableMigrations() - - TEST_RUNNER = 'django_nose.NoseTestSuiteRunner' NOSE_ARGS = [ '--nocapture', '--nologcapture', '--verbosity=3', - '--ignore-files=urls.py', - '--ignore-files=wsgi.py', + '--exclude-dir=chatbot_ner/', + '--exclude-dir=docs/', + '--exclude-dir=docker/', + '--exclude-dir=data/', '--ignore-files=manage.py', '--ignore-files=nltk_setup.py', '--ignore-files=__init__.py', '--ignore-files=const.py', '--ignore-files=constant.py', '--ignore-files=constants.py', - '--ignore-files=settings.py', '--ignore-files=run_postman_tests.py', - '--exclude-dir=docs/', - '--exclude-dir=docker/', - '--exclude-dir=data/', + '--cover-erase', + '--cover-package=datastore,external_api,language_utilities,lib,models,ner_v1,ner_v2', + '--cover-inclusive', ] # Internationalization diff --git a/chatbot_ner/setup_sentry.py b/chatbot_ner/setup_sentry.py index 9446464a0..4b8e53a0a 100644 --- a/chatbot_ner/setup_sentry.py +++ b/chatbot_ner/setup_sentry.py @@ -9,8 +9,8 @@ # Support for Sentry DSN SENTRY_DSN = os.environ.get('SENTRY_DSN') -SENTRY_ENABLED = os.environ.get('SENTRY_ENABLED') -SENTRY_ENABLED = True if SENTRY_ENABLED == 'True' and 'test' not in sys.argv else False +_sentry_enabled = (os.environ.get('SENTRY_ENABLED') or '').strip().lower() +SENTRY_ENABLED = (_sentry_enabled == 'true' and 'test' not in sys.argv) def setup_sentry(): diff --git a/logs/.gitkeep b/logs/.gitkeep new file mode 100644 index 000000000..e69de29bb diff --git a/ner_v1/detectors/textual/name/hindi_const.py b/ner_v1/detectors/textual/name/lang_constants.py similarity index 100% rename from ner_v1/detectors/textual/name/hindi_const.py rename to ner_v1/detectors/textual/name/lang_constants.py diff --git a/ner_v1/detectors/textual/name/name_detection.py b/ner_v1/detectors/textual/name/name_detection.py index bbb0dcd64..31ef23640 100644 --- a/ner_v1/detectors/textual/name/name_detection.py +++ b/ner_v1/detectors/textual/name/name_detection.py @@ -9,14 +9,13 @@ from language_utilities.constant import (ENGLISH_LANG, INDIC_LANGUAGES_SET, EUROPEAN_LANGUAGES_SET) from ner_v1.constant import DATASTORE_VERIFIED, MODEL_VERIFIED from ner_v1.constant import EMOJI_RANGES, FIRST_NAME, MIDDLE_NAME, LAST_NAME -from ner_v1.detectors.textual.name.hindi_const import (INDIC_BADWORDS, INDIC_QUESTIONWORDS, - INDIC_STOPWORDS, NAME_VARIATIONS, INDIC_UNICODE_RANGE, - COMMON_INDIC_WORDS_OCCURRING_WITH_NAME) +from ner_v1.detectors.textual.name.lang_constants import (INDIC_BADWORDS, INDIC_QUESTIONWORDS, + INDIC_STOPWORDS, NAME_VARIATIONS, INDIC_UNICODE_RANGE, + COMMON_INDIC_WORDS_OCCURRING_WITH_NAME) from six.moves import range # TODO: Refactor this module for readability and useability. Remove any hacks -# TODO: Make this module python 3 compatible class NameDetector(object): """ @@ -62,21 +61,24 @@ def get_format_name(name_tokens, text): 2.The original text. Args: - name_tokens (list): List of tokens in the name - Example: - ['yash', 'doshi'] + name_tokens (list): List of tokens in the name. e.g. ['yash', 'doshi'] Returns: - ( - [{first_name: "yash", middle_name: None, last_name: "doshi"}], - ["yash modi"] - ) + (list, list): tuple containing + list: list of dictionaries, one for each detected name + list: list of str, the original text span for each detected name + + Examples: + >>> NameDetector.get_format_name(['yash', 'p.', 'm.', 'doshi'], 'my name is yash p. m. doshi') + ([{first_name: 'yash', middle_name: 'p. m.', last_name: 'doshi'}], + ['yash p. m. doshi']) """ entity_value = [] original_text = [] + if not name_tokens: + return entity_value, original_text name_text = " ".join(name_tokens) - first_name = name_tokens[0] middle_name = None last_name = None @@ -166,7 +168,7 @@ def detect_entity(self, text, bot_message=None, predetected_values=None, **kwarg if self.language in EUROPEAN_LANGUAGES_SET | {ENGLISH_LANG}: entity_value, original_text = self.detect_english_name() elif self.language in INDIC_LANGUAGES_SET: - entity_value, original_text = self.detect_hindi_name() + entity_value, original_text = self.detect_indic_name() for entity_value_dict in entity_value: entity_value_dict.update({DATASTORE_VERIFIED: True, MODEL_VERIFIED: False}) @@ -201,7 +203,7 @@ def detect_english_name(self, text=None): entity_value, original_text = self.get_name_using_pos_tagger(text) return entity_value, original_text - def detect_hindi_name(self): + def detect_indic_name(self): """ This method is used to detect Hindi names from the provided text @@ -216,7 +218,7 @@ def detect_hindi_name(self): >> [{first_name: u"प्रतिक", middle_name: u"श्रीदत्त", last_name: u"जयराओ"}], [ u'प्रतिक श्रीदत्त जयराओ'] """ - if self.detect_abusive_phrases_hindi(text=self.text) or self.detect_question_hindi(text=self.text): + if self.detect_abusive_phrases_indic(text=self.text) or self.detect_question_indic(text=self.text): return [], [] text = self.remove_emojis(text=self.text) @@ -224,7 +226,7 @@ def detect_hindi_name(self): regex = re.compile(u'[^{unicode_range}\\s]+'.format(unicode_range=INDIC_UNICODE_RANGE[self.language]), re.U) text = regex.sub(string=text, repl='') - entity_value, original_text = self.get_hindi_names_without_regex(text=text) + entity_value, original_text = self.get_indic_names_without_regex(text=text) # Further check for name, if it might have been written in latin script. if not entity_value: english_present_regex = re.compile(u'[a-zA-Z]+', re.U) @@ -364,6 +366,7 @@ def detect_person_name_entity(self, replaced_text): def context_check_botmessage(self, botmessage): """ Checks if previous botmessage conatins name as a keyword or not + Args: botmessage: it consists of the previous botmessage @@ -377,12 +380,12 @@ def context_check_botmessage(self, botmessage): botmessage = regex_pattern.sub(r'', botmessage) botmessage = " " + botmessage.lower().strip() + " " - for variant in NAME_VARIATIONS[self.language]: + for variant in NAME_VARIATIONS.get(self.language, []): if " " + variant + " " in botmessage: return True return False - def get_hindi_names_without_regex(self, text): + def get_indic_names_without_regex(self, text): """ This method is used to get detect hindi names without any regex pattern (This method is called only if detection from regex patterns fails) @@ -430,7 +433,7 @@ def replace_stopwords_hindi(self, text): return "" - def detect_abusive_phrases_hindi(self, text): + def detect_abusive_phrases_indic(self, text): """ This method is used to check for hindi abuses in the sentence Args: @@ -457,7 +460,7 @@ def remove_emojis(self, text): text = emoji_pattern.sub(repl='', string=text) return text - def detect_question_hindi(self, text): + def detect_question_indic(self, text): """ This method is used to detect if the given text has a hindi question present in it Args: diff --git a/ner_v1/static/index.html b/ner_v1/static/index.html deleted file mode 100644 index 44f8fe0ab..000000000 --- a/ner_v1/static/index.html +++ /dev/null @@ -1,155 +0,0 @@ - - - - Haptik NER - - - - - - - - - - - - - - - -
-

CHATBOT NER GUI

-
- - - - - - - - -
-
- - -
- -
-
- Entity Type -
- - - -
- -
- -
- Entity Name -
- - -
- -
- -
- Message -
- -
- -
-
- -
- -
- Bot Message -
- -
- -
-
- -
- -
- Structured Value -
- -
- -
-
- -
- -
- Fallback Value -
- -
- -
-
- - - Detect! - -
- -
	  			
-	  		
-
- - - - - - - - -
- - - diff --git a/ner_v1/static/ner_dashboard.css b/ner_v1/static/ner_dashboard.css deleted file mode 100644 index b2fc4a4ec..000000000 --- a/ner_v1/static/ner_dashboard.css +++ /dev/null @@ -1,45 +0,0 @@ -.top-header{ - text-align: center; - background: #4db9e6; - font-family: 'latobold'; - font-size: 44px; - color: white; - padding: 10px; -} - -.inputlabel{ - //margin-right: 50px; - display: inline-block; - height: 34px; - padding-top: 8px; -} - -.dropdown{ - padding-left: 0px !important; -} -.tab-pane{ - padding-top: 30px; - padding-left: 30px; -} - -.input-group, #entitynamefield{ - padding-top: 10px; -} - -#entitynamefield{ - padding-top: 10px; -} - -#entitysubmitbtn{ - margin-top: 10px; -} - -pre {outline: 1px solid #ccc; padding: 5px; margin: 5px; } -.string { color: green; } -.number { color: darkorange; } -.boolean { color: blue; } -.null { color: magenta; } -.key { color: red; } - - - diff --git a/ner_v1/static/ner_dashboard.js b/ner_v1/static/ner_dashboard.js deleted file mode 100644 index cd5348336..000000000 --- a/ner_v1/static/ner_dashboard.js +++ /dev/null @@ -1,90 +0,0 @@ -$(document).ready(function(){ - - $("#typedropdown li a").click(function(event){ - $(this).parent().parent().siblings(".btn:first-child").html($(this).text()+' '); - var entitytype = $(this).text(); - if (entitytype == "Text"){ - $("#entitynamefield").show(); - $("#entitynamefield").empty().html(""); - $("#entitynamefield li a").click(function(event){ - $(this).parent().parent().siblings(".btn:first-child").html($(this).text()+' '); - }); - } - else{ - $("#entitynamefield").show(); - $("#entitynamefield").empty().html("
") - } - - }); - - - - $("#entitysubmitbtn").click(function(){ - var entityType = $("#entitytypes").text(); - var entityName = $("#entitynames").text(); - if (!entityName){ - entityName = $("#entitynameinput").val(); - } - var structuredValue = $("#structuredvalue").val(); - var botMessage = $("#botmessage").val(); - var message = $("#message").val(); - var fallbackValue = $("#fallbackvalue").val(); - - var entityUrl = "/v1/" + entityType.trim().toLowerCase().replace(/ /g,"_") + "/"; - - $.ajax({ - url: entityUrl, - type: "get", - contentType:"application/json", - data: { - message: message.trim(), - entity_name: entityName.trim().toLowerCase(), - structured_value: structuredValue.trim(), - bot_message: botMessage.trim(), - fallback_value: fallbackValue.trim(), - - }, - success: function(data, textStatus, XmlHttpRequest){ - var str = JSON.stringify(data, undefined, 4); - output(syntaxHighlight(str)); - }, - error: function(xhr, a, b){ - var errorMessage = 'Oops! Something went wonrg, please check your input data'; - output(errorMessage); - } - - }); - - }); - - var obj = {a:1, 'b':'foo', c:[false,'false',null, 'null', {d:{e:1.3e5,f:'1.3e5'}}]}; - var str = JSON.stringify(obj, undefined, 4); - -}); - -function output(inp) { - $("#entityoutput").html(inp); -} - -/* - Function to sytax highlight the JSON. Gotten by stack overflow answer - https://stackoverflow.com/questions/4810841/how-can-i-pretty-print-json-using-javascript -*/ -function syntaxHighlight(json) { - json = json.replace(/&/g, '&').replace(//g, '>'); - return json.replace(/("(\\u[a-zA-Z0-9]{4}|\\[^u]|[^\\"])*"(\s*:)?|\b(true|false|null)\b|-?\d+(?:\.\d*)?(?:[eE][+\-]?\d+)?)/g, function (match) { - var cls = 'number'; - if (/^"/.test(match)) { - if (/:$/.test(match)) { - cls = 'key'; - } else { - cls = 'string'; - } - } else if (/true|false/.test(match)) { - cls = 'boolean'; - } else if (/null/.test(match)) { - cls = 'null'; - } - return '' + match + ''; - }); -} diff --git a/ner_v2/api.py b/ner_v2/api.py index c863c44ae..864f74062 100644 --- a/ner_v2/api.py +++ b/ner_v2/api.py @@ -1,25 +1,25 @@ # -*- coding: utf-8 -*- from __future__ import absolute_import + +import json + +import six +from django.http import HttpResponse, JsonResponse +from django.views.decorators.csrf import csrf_exempt + from chatbot_ner.config import ner_logger +from language_utilities.constant import ENGLISH_LANG from ner_constants import PARAMETER_MESSAGE, PARAMETER_ENTITY_NAME, PARAMETER_STRUCTURED_VALUE, \ PARAMETER_FALLBACK_VALUE, \ PARAMETER_BOT_MESSAGE, PARAMETER_TIMEZONE, PARAMETER_LANGUAGE_SCRIPT, PARAMETER_SOURCE_LANGUAGE, \ PARAMETER_PAST_DATE_REFERENCED, PARAMETER_MIN_DIGITS, PARAMETER_MAX_DIGITS, PARAMETER_NUMBER_UNIT_TYPE, \ PARAMETER_LOCALE, PARAMETER_RANGE_ENABLED - -from ner_v2.detectors.temporal.date.date_detection import DateAdvancedDetector -from ner_v2.detectors.temporal.time.time_detection import TimeDetector from ner_v2.detectors.numeral.number.number_detection import NumberDetector from ner_v2.detectors.numeral.number_range.number_range_detection import NumberRangeDetector - -from ner_v2.detectors.textual.utils import get_text_entity_detection_data, verify_text_request -from language_utilities.constant import ENGLISH_LANG from ner_v2.detectors.pattern.phone_number.phone_number_detection import PhoneDetector - -from django.views.decorators.csrf import csrf_exempt -from django.http import HttpResponse -import json -import six +from ner_v2.detectors.temporal.date.date_detection import DateAdvancedDetector +from ner_v2.detectors.temporal.time.time_detection import TimeDetector +from ner_v2.detectors.textual.utils import get_text_entity_detection_data, verify_text_request def get_parameters_dictionary(request): @@ -163,7 +163,7 @@ def date(request): ner_logger.exception('Exception for date: %s ' % e) return HttpResponse(status=500) - return HttpResponse(json.dumps({'data': entity_output}), content_type='application/json') + return JsonResponse({'data': entity_output}) @csrf_exempt @@ -242,7 +242,7 @@ def time(request): ner_logger.exception('Exception for time: %s ' % e) return HttpResponse(status=500) - return HttpResponse(json.dumps({'data': entity_output}), content_type='application/json') + return JsonResponse({'data': entity_output}) @csrf_exempt @@ -338,7 +338,7 @@ def number(request): ner_logger.exception('Exception for numeric: %s ' % e) return HttpResponse(status=500) - return HttpResponse(json.dumps({'data': entity_output}), content_type='application/json') + return JsonResponse({'data': entity_output}) @csrf_exempt @@ -409,7 +409,7 @@ def number_range(request): ner_logger.exception('Exception for numeric: %s ' % e) return HttpResponse(status=500) - return HttpResponse(json.dumps({'data': entity_output}), content_type='application/json') + return JsonResponse({'data': entity_output}) @csrf_exempt @@ -554,7 +554,7 @@ def phone_number(request): ner_logger.exception('Exception for phone_number: %s ' % e) return HttpResponse(status=500) - return HttpResponse(json.dumps({'data': entity_output}), content_type='application/json') + return JsonResponse({'data': entity_output}) @csrf_exempt @@ -668,7 +668,7 @@ def text(request): if request.method == "GET": response = {"success": False, "error": "Get method is not allowed"} - return HttpResponse(json.dumps(response), status=501) + return JsonResponse(response, status=405) elif request.method == "POST": ner_logger.debug("Fetching result") @@ -682,22 +682,18 @@ def text(request): response = {"success": False, "error": str(err)} # TODO: move to ner_logger.error ner_logger.exception(response) - return HttpResponse(json.dumps(response), content_type='application/json', - status=400) + return JsonResponse(response, status=400) except TypeError as err: response = {"success": False, "error": str(err)} ner_logger.exception(response) - return HttpResponse(json.dumps(response), content_type='application/json', - status=400) + return JsonResponse(response, status=400) except Exception as err: response = {"success": False, "error": str(err)} ner_logger.exception(response) - return HttpResponse(json.dumps(response), content_type='application/json', - status=400) - + return JsonResponse(response, status=500) if data: response = {"success": True, "error": None, "data": data} - return HttpResponse(json.dumps(response), content_type='application/json', status=200) + return JsonResponse(response, status=200) else: response = {"success": False, "error": "Some error while parsing"} - return HttpResponse(json.dumps(response), status=400) + return JsonResponse(response, status=500) diff --git a/ner_v2/detectors/numeral/number/hi/data/numerals_constant.csv b/ner_v2/detectors/numeral/number/hi/data/numerals_constant.csv index 47e0a4922..483b8de05 100644 --- a/ner_v2/detectors/numeral/number/hi/data/numerals_constant.csv +++ b/ner_v2/detectors/numeral/number/hi/data/numerals_constant.csv @@ -5,51 +5,51 @@ number,name_variants,number_value,number_type २,दो|do|du,2,unit २.५,ढाई|ढ़ाई|धाइ|धाई|dhaai|daai|dhai|dai,2.5,unit ३,तीन|teen|tin,3,unit -४,चार |char|chaar,4,unit +४,चार|char|chaar,4,unit ५,पाँच|पांच|panch|paanch|paach,5,unit -६,छह |chhe|chhah|chheh,6,unit -७,सात |saat,7,unit -८,आठ |ath|aath,8,unit -९,नौ |nau|nao,9,unit -१०,दस | dus|das,10,unit -११,ग्यारह |gyareh|gyarah,11,unit -१२,बारह | bareh|barah,12,unit -१३,तेरह | terah|tereh,13,unit -१४,चौदह |chaudeh|chaudah|chauda,14,unit -१५,पन्द्रह | pandreh | pandrah|pehdrah|pendreh,15,unit -१६,सोलह |solah|soleh|sholah|sholeh,16,unit -१७,सत्रह |satreh|starah,17,unit -१८,अठारह | athrah|athreh|aththarah|aththareh,18,unit -१९,उन्नीस | unnis| unnish,19,unit -२०,बीस | bis|bish|bees|beesh,20,unit -२१,इक्कीस | ikkis|ikkish|ekkis|ekkish,21,unit -२२,बाईस | bais| baish|bayis|bayish,22,unit -२३,तेईस | teis|teish|teyis|teyish,23,unit -२४,चौबीस | chaubis|chaubish|chaubees|chaubeesh,24,unit -२५,पच्चीस |pachis|pachish|pachchis|pachchish|pachees|pacheesh|pachchees|pachcheesh,25,unit -२६,छब्बीस | chhabis|chhabish|chhabees|chhabeesh|chhabbis|chhabbish|chhabbees|chhabbeesh,26,unit -२७,सत्ताईस | sattais|sattaish|sattaees|sattaeesh,27,unit -२८,अट्ठाईस | athais|athaish|athaees|athaeesh|aththais|aththaish|aththaees|aththaeesh,28,unit -२९,उनतीस | untis|untish|untees|unteesh,29,unit -३०,तीस | tis| tish|tees|teesh,30,unit -३१,इकतीस |ikkatis|ikkatish|ikattis|ikattish|ikkattis|ikkattish|ekkatis|ekkatish|ekattis|ekattish|ekkattis|ekkattish,31,unit +६,छह|chhe|chhah|chheh,6,unit +७,सात|saat,7,unit +८,आठ|ath|aath,8,unit +९,नौ|nau|nao,9,unit +१०,दस| dus|das,10,unit +११,ग्यारह|gyareh|gyarah,11,unit +१२,बारह| bareh|barah,12,unit +१३,तेरह| terah|tereh,13,unit +१४,चौदह|chaudeh|chaudah|chauda,14,unit +१५,पन्द्रह| pandreh | pandrah|pehdrah|pendreh,15,unit +१६,सोलह|solah|soleh|sholah|sholeh,16,unit +१७,सत्रह|satreh|starah,17,unit +१८,अठारह| athrah|athreh|aththarah|aththareh,18,unit +१९,उन्नीस| unnis| unnish,19,unit +२०,बीस| bis|bish|bees|beesh,20,unit +२१,इक्कीस| ikkis|ikkish|ekkis|ekkish,21,unit +२२,बाईस| bais| baish|bayis|bayish,22,unit +२३,तेईस| teis|teish|teyis|teyish,23,unit +२४,चौबीस| chaubis|chaubish|chaubees|chaubeesh,24,unit +२५,पच्चीस|pachis|pachish|pachchis|pachchish|pachees|pacheesh|pachchees|pachcheesh,25,unit +२६,छब्बीस| chhabis|chhabish|chhabees|chhabeesh|chhabbis|chhabbish|chhabbees|chhabbeesh,26,unit +२७,सत्ताईस| sattais|sattaish|sattaees|sattaeesh,27,unit +२८,अट्ठाईस| athais|athaish|athaees|athaeesh|aththais|aththaish|aththaees|aththaeesh,28,unit +२९,उनतीस| untis|untish|untees|unteesh,29,unit +३०,तीस| tis| tish|tees|teesh,30,unit +३१,इकतीस|ikkatis|ikkatish|ikattis|ikattish|ikkattis|ikkattish|ekkatis|ekkatish|ekattis|ekattish|ekkattis|ekkattish,31,unit ३२,बत्तीस| batis|batish|battis|battish|batees|bateesh|battees|batteesh,32,unit ३३,तैंतीस|taitis|taitish|taitees|taiteesh|taintis|taintish|taintees|tainteesh,33,unit -३४,चौंतीस | chautis|chautish|chautees|chauteesh|chauntis|chauntish|chauntees|chaunteesh,34,unit +३४,चौंतीस| chautis|chautish|chautees|chauteesh|chauntis|chauntish|chauntees|chaunteesh,34,unit ३५,पैंतीस|paitis|paitish|paitees|paiteesh|paintis|paintish|paintees|painteesh,35,unit ३६,छत्तीस|chhatis|chhatish|chhatees|chhateesh|chhattis|chhattish|chhattees|chhatteesh,36,unit ३७,सैंतीस|saitis|saitish|saitees|saiteesh|saintis|saintish|saintees|sainteesh,37,unit ३८,अड़तीस|adtis|adtish|adtees|adteesh,38,unit -३९,उनतालीस |unchalis|unchalish|unchalees|unchaleesh,39,unit +३९,उनतालीस|unchalis|unchalish|unchalees|unchaleesh,39,unit ४०,चालीस|chalis|chalish|chalees|chaleesh,40,unit ४१,इकतालीस|iktalis|iktalish|iktalees|iktaleesh|ektalis|ektalish|ektalees|ektaleesh,41,unit ४२,बयालीस|bayalis|bayalish|bayalees|bayaleesh,42,unit ४३,तैंतालीस|taitalis|taitalish|taitalees|taitaleesh|taintalis|taintalish|taintalees|taintaleesh,43,unit ४४,चौंतालीस|चौवालिश|चौवालिस|chautalis|chautalish|chautalees|chautaleesh|chauntalis|chauntalish|chauntalees|chauntaleesh|chauvalis|chauvalish|chauvalees|chauvaleesh,44,unit -४५,पैंतालीस |paitalis|paitalish|paitalees|paitaleesh|paintalis|paintalish|paintalees|paintaleesh,45,unit +४५,पैंतालीस|paitalis|paitalish|paitalees|paitaleesh|paintalis|paintalish|paintalees|paintaleesh,45,unit ४६,छियालीस|chhiyalis|chhiyalish|chhiyalees|chhiyaleesh|chhialis|chhialish|chhialees|chhialeesh,46,unit -४७,सैंतालीस |saitalis|saitalish|saitalees|saitaleesh|saintalis|saintalish|saintalees|saintaleesh,47,unit -४८,अड़तालीस |adtalis|adtalish|adtalees|adtaleesh,48,unit +४७,सैंतालीस|saitalis|saitalish|saitalees|saitaleesh|saintalis|saintalish|saintalees|saintaleesh,47,unit +४८,अड़तालीस|adtalis|adtalish|adtalees|adtaleesh,48,unit ४९,उनचास|unchaas|unchaash|unchas|unchash,49,unit ५०,पचास|pachas|pachash|pachaas|pachaash,50,unit ५१,इक्याबन|इक्याबन|ikyavan|ikyawan|ekyavan|ekyawan,51,unit @@ -102,6 +102,6 @@ number,name_variants,number_value,number_type ९८,अट्ठानवे|aththanve|aththanwe|aththanave|aththanawe|athanve|athanwe|athanave|athanawe,98,unit ९९,निन्यानवे|निन्यानबे|ninyanbe|ninyanabe|ninyanve|ninyanave|ninyanwe|ninyanawe,99,unit १००,सौ|sau|sao,100,scale -१०००,हज़ार|हजार|hajar|hajaar|hazar|hazaar,1000,scale +१०००,हज़ार|हजार|hajar|hajaar|hazar|hazaar|k,1000,scale १०००००,लाख|lakh|laakh|lac,100000,scale १०००००००,करोड़|crore|karor|caror,10000000,scale diff --git a/ner_v2/detectors/numeral/number/standard_number_detector.py b/ner_v2/detectors/numeral/number/standard_number_detector.py index b7e86b463..0c454f64e 100644 --- a/ner_v2/detectors/numeral/number/standard_number_detector.py +++ b/ner_v2/detectors/numeral/number/standard_number_detector.py @@ -1,8 +1,10 @@ # coding=utf-8 from __future__ import absolute_import -import pandas as pd + import collections import os + +import pandas as pd from six.moves import zip try: @@ -28,6 +30,8 @@ class BaseNumberDetector(object): + _SPAN_BOUNDARY_TEMPLATE = r'(?:^|(?<=[\s\"\'\,\-\?])){}(?=[\s\!\"\%\'\,\?\.\-]|$)' + def __init__(self, entity_name, data_directory_path, unit_type=None): """ Standard Number detection class, read data from language data path and help to detect number and numbers words @@ -62,8 +66,7 @@ def __init__(self, entity_name, data_directory_path, unit_type=None): # Variable to define default order in which detector will work self.detector_preferences = [self._detect_number_from_digit, - self._detect_number_from_words - ] + self._detect_number_from_words] def detect_number(self, text): """ @@ -224,23 +227,19 @@ def _detect_number_from_words(self, number_list=None, original_list=None): for numeral_text in numeral_text_list: numbers, original_texts = get_number_from_number_word(numeral_text, self.numbers_word_map) full_list = list(zip(numbers, original_texts)) - """ - list() is added to above zip as in python 3, zip() returns a zip object instead of zip function and - our lint checker is matching it for python 3 - """ sorted_full_list = sorted(full_list, key=lambda kv: len(kv[1]), reverse=True) for number, original_text in sorted_full_list: unit = None if self.unit_type: unit, original_text = self._get_unit_from_text(original_text, numeral_text) - # numeral_text = numeral_text.replace(original_text, self.tag) - _pattern = re.compile(r'\b%s\b' % re.escape(original_text), flags=_re_flags) - numeral_text = _pattern.sub(self.tag, numeral_text) - number_list.append({ - NUMBER_DETECTION_RETURN_DICT_VALUE: str(number), - NUMBER_DETECTION_RETURN_DICT_UNIT: unit - }) - original_list.append(original_text) + _pattern = re.compile(self._SPAN_BOUNDARY_TEMPLATE.format(re.escape(original_text)), flags=_re_flags) + if _pattern.search(numeral_text): + numeral_text = _pattern.sub(self.tag, numeral_text, 1) + number_list.append({ + NUMBER_DETECTION_RETURN_DICT_VALUE: str(number), + NUMBER_DETECTION_RETURN_DICT_UNIT: unit + }) + original_list.append(original_text) return number_list, original_list def _detect_number_from_digit(self, number_list=None, original_list=None): @@ -294,12 +293,12 @@ def _detect_number_from_digit(self, number_list=None, original_list=None): number, scale, original_text = None, None, None if pattern[1] and pattern[1].replace(',', '').replace('.', '').isdigit(): number = pattern[1].replace(',', '') - original_text = pattern[0].strip() + original_text = pattern[0].strip().strip(',.').strip() scale = self.scale_map[pattern[2].strip()] elif pattern[3] and pattern[3].replace(',', '').replace('.', '').isdigit(): number = pattern[3].replace(',', '') - original_text = pattern[3].strip() + original_text = pattern[3].strip().strip(',.').strip() scale = 1 if number: @@ -308,13 +307,14 @@ def _detect_number_from_digit(self, number_list=None, original_list=None): unit = None if self.unit_type: unit, original_text = self._get_unit_from_text(original_text, processed_text) - _pattern = re.compile(r'\b%s\b' % re.escape(original_text), flags=_re_flags) - processed_text = _pattern.sub(self.tag, processed_text) - number_list.append({ - NUMBER_DETECTION_RETURN_DICT_VALUE: str(number), - NUMBER_DETECTION_RETURN_DICT_UNIT: unit - }) - original_list.append(original_text) + _pattern = re.compile(self._SPAN_BOUNDARY_TEMPLATE.format(re.escape(original_text)), flags=_re_flags) + if _pattern.search(processed_text): + processed_text = _pattern.sub(self.tag, processed_text, 1) + number_list.append({ + NUMBER_DETECTION_RETURN_DICT_VALUE: str(number), + NUMBER_DETECTION_RETURN_DICT_UNIT: unit + }) + original_list.append(original_text) return number_list, original_list @@ -330,9 +330,9 @@ def _update_processed_text(self, original_number_list): created from entity_name """ for detected_text in original_number_list: - _pattern = re.compile(r'\b%s\b' % re.escape(detected_text), flags=_re_flags) - self.tagged_text = _pattern.sub(self.tag, self.tagged_text) - self.processed_text = _pattern.sub('', self.processed_text) + _pattern = re.compile(self._SPAN_BOUNDARY_TEMPLATE.format(re.escape(detected_text)), flags=_re_flags) + self.tagged_text = _pattern.sub(self.tag, self.tagged_text, 1) + self.processed_text = _pattern.sub('', self.processed_text, 1) class NumberDetector(BaseNumberDetector): diff --git a/ner_v2/detectors/numeral/number_range/README.md b/ner_v2/detectors/numeral/number_range/README.md index 9f3e32841..fb6e72266 100644 --- a/ner_v2/detectors/numeral/number_range/README.md +++ b/ner_v2/detectors/numeral/number_range/README.md @@ -254,8 +254,8 @@ class NumberRangeDetector(BaseNumberRangeDetector): """ number_range_list = number_range_list or [] original_list = original_list or [] - between_range_pattern = re.compile(ur'(between\s+({number}\d+)(?:\s+and|-)' - ur'\s+({number}\d+))'.format(number=NUMBER_REPLACE_TEXT), re.UNICODE) + between_range_pattern = re.compile(ur'(between\s+({number}\d+__)(?:\s+and|-)' + ur'\s+({number}\d+__))'.format(number=NUMBER_REPLACE_TEXT), re.UNICODE) number_range_matches = between_range_pattern.findall(self.processed_text) for match in number_range_matches: number_range, original_text = self._get_number_range(min_part_match=match[1], max_part_match=match[2], diff --git a/ner_v2/detectors/numeral/number_range/en/number_range_detection.py b/ner_v2/detectors/numeral/number_range/en/number_range_detection.py index 56b31a20c..779e0339c 100644 --- a/ner_v2/detectors/numeral/number_range/en/number_range_detection.py +++ b/ner_v2/detectors/numeral/number_range/en/number_range_detection.py @@ -24,8 +24,7 @@ def __init__(self, entity_name, language, unit_type=None): self._detect_min_num_range_with_suffix_variants, self._detect_max_num_range_with_prefix_variants, self._detect_max_num_range_with_suffix_variants, - self._detect_absolute_number - ] + self._detect_absolute_number] def _custom_num_range_between_num_and_num(self, number_range_list=None, original_list=None): """Detects number range of text of pattern between number1 to number2 @@ -42,8 +41,8 @@ def _custom_num_range_between_num_and_num(self, number_range_list=None, original """ number_range_list = number_range_list or [] original_list = original_list or [] - between_range_pattern = re.compile(u'(between\\s+({number}\\d+)(?:\\s+and|-)' - u'\\s+({number}\\d+))'.format(number=NUMBER_REPLACE_TEXT), re.UNICODE) + between_range_pattern = re.compile(r'(between\s+({number}\d+__)\s+(?:and|-)' + r'\s+({number}\d+__))'.format(number=NUMBER_REPLACE_TEXT), re.UNICODE) number_range_matches = between_range_pattern.findall(self.processed_text) for match in number_range_matches: number_range, original_text = self._get_number_range(min_part_match=match[1], max_part_match=match[2], diff --git a/ner_v2/detectors/numeral/number_range/hi/data/number_range_keywords.csv b/ner_v2/detectors/numeral/number_range/hi/data/number_range_keywords.csv index ac0f2e562..6b3bd32bb 100644 --- a/ner_v2/detectors/numeral/number_range/hi/data/number_range_keywords.csv +++ b/ner_v2/detectors/numeral/number_range/hi/data/number_range_keywords.csv @@ -3,4 +3,4 @@ ke upar| k upar| ke uper| k uper| se upar| se uper| se jada | se jyada | se adh kam se kam| कम से कम,-1,min jada se jada | jyada se jyada | lagbhag | ज्यादा से ज्यादा | जादा से जादा | लगभग,-1,max se niche | se kam | se sasta | se saste |ke aaspas| ke aspas | k aaspas| k aspas | ke aas paas| ke aas pas| k aas paas| k aas pas|ke lagbhag| k lagbhag | से नीचे | से कम | से सस्ता | से सस्ते | के आसपास | के आस पास | के लगभग,1,max -se|-|से,0,min_max +se|-|–|से,0,min_max diff --git a/ner_v2/detectors/numeral/number_range/standard_number_range_detector.py b/ner_v2/detectors/numeral/number_range/standard_number_range_detector.py index fa24c3d49..3da7677c1 100644 --- a/ner_v2/detectors/numeral/number_range/standard_number_range_detector.py +++ b/ner_v2/detectors/numeral/number_range/standard_number_range_detector.py @@ -1,20 +1,25 @@ # coding=utf-8 from __future__ import absolute_import -import pandas as pd + import collections import os + +import pandas as pd +from six.moves import zip + import ner_v2.detectors.numeral.constant as numeral_constant -from ner_v2.detectors.numeral.utils import get_list_from_pipe_sep_string from ner_v2.detectors.numeral.number.number_detection import NumberDetector -from six.moves import zip +from ner_v2.detectors.numeral.utils import get_list_from_pipe_sep_string try: import regex as re + _re_flags = re.UNICODE | re.V1 | re.WORD except ImportError: import re + _re_flags = re.UNICODE NumberRangeVariant = collections.namedtuple('NumberRangeVariant', ['position', 'range_type']) @@ -64,8 +69,7 @@ def __init__(self, entity_name, language, data_directory_path, unit_type=None): self._detect_min_num_range_with_suffix_variants, self._detect_max_num_range_with_prefix_variants, self._detect_max_num_range_with_suffix_variants, - self._detect_absolute_number - ] + self._detect_absolute_number] def _init_regex_for_range(self, data_directory_path): """ @@ -98,7 +102,7 @@ def _init_regex_for_range(self, data_directory_path): self.min_range_suffix_variants = [re.escape(variant) for variant, value in self.range_variants_map.items() if (value.position == 1 and - value.range_type == numeral_constant.NUMBER_RANGE_MIN_TYPE)] + value.range_type == numeral_constant.NUMBER_RANGE_MIN_TYPE)] self.max_range_prefix_variants = [re.escape(variant) for variant, value in self.range_variants_map.items() if (value.position == -1 and @@ -128,10 +132,11 @@ def _tag_number_in_text(self, processed_text): """ tagged_number_text = processed_text sorted_number_detected_map = sorted(list(self.number_detected_map.items()), - key=lambda kv: len(kv[1].original_text), - reverse=True) - for number_tag in sorted_number_detected_map: - tagged_number_text = tagged_number_text.replace(number_tag[1].original_text, number_tag[0], 1) + key=lambda kv: len(kv[1].original_text), reverse=True) + span_template = self.number_detector.language_number_detector._SPAN_BOUNDARY_TEMPLATE + for number_tag, value_text_pair in sorted_number_detected_map: + tagged_number_text = re.sub(span_template.format(re.escape(value_text_pair.original_text)), number_tag, + tagged_number_text, count=1, flags=_re_flags) return tagged_number_text def _get_number_tag_dict(self): @@ -148,8 +153,8 @@ def _get_number_tag_dict(self): detected_number_dict = {} entity_value_list, original_text_list = self.number_detector.detect_entity(self.processed_text) for index, (entity_value, original_text) in enumerate(zip(entity_value_list, original_text_list)): - detected_number_dict[numeral_constant.NUMBER_REPLACE_TEXT + str(index)] = ValueTextPair( - entity_value=entity_value, original_text=original_text) + key = '{number}{index}__'.format(number=numeral_constant.NUMBER_REPLACE_TEXT, index=index) + detected_number_dict[key] = ValueTextPair(entity_value=entity_value, original_text=original_text) return detected_number_dict def _get_original_text_from_tagged_text(self, number_tag_text): @@ -194,7 +199,7 @@ def detect_number_range(self, text): def _detect_absolute_number(self, number_list, original_list): number_list = number_list or [] original_list = original_list or [] - abs_number_pattern = re.compile(u'({number}\\d+)'.format(number=numeral_constant.NUMBER_REPLACE_TEXT), + abs_number_pattern = re.compile(r'({number}\d+__)'.format(number=numeral_constant.NUMBER_REPLACE_TEXT), re.UNICODE) abs_number_matches = abs_number_pattern.findall(self.processed_text) for match in abs_number_matches: @@ -282,7 +287,7 @@ def _detect_min_num_range_with_prefix_variants(self, number_range_list=None, ori if self.min_range_prefix_variants: min_prefix_choices = '|'.join(self.min_range_prefix_variants) - min_range_start_pattern = re.compile(u'((?:{min_prefix_choices})\\s+({number}\\d+))'.format( + min_range_start_pattern = re.compile(r'((?:{min_prefix_choices})\s+({number}\d+__))'.format( number=numeral_constant.NUMBER_REPLACE_TEXT, min_prefix_choices=min_prefix_choices), re.UNICODE) number_range_matches = min_range_start_pattern.findall(self.processed_text) for match in number_range_matches: @@ -310,7 +315,7 @@ def _detect_min_num_range_with_suffix_variants(self, number_range_list=None, ori if self.min_range_suffix_variants: min_suffix_choices = '|'.join(self.min_range_suffix_variants) - min_range_end_pattern = re.compile(u'(({number}\\d+)\\s+(?:{min_suffix_choices}))'.format( + min_range_end_pattern = re.compile(r'(({number}\d+__)\s+(?:{min_suffix_choices}))'.format( number=numeral_constant.NUMBER_REPLACE_TEXT, min_suffix_choices=min_suffix_choices), re.UNICODE) number_range_matches = min_range_end_pattern.findall(self.processed_text) for match in number_range_matches: @@ -340,7 +345,7 @@ def _detect_max_num_range_with_prefix_variants(self, number_range_list=None, ori if self.max_range_prefix_variants: max_prefix_choices = '|'.join(self.max_range_prefix_variants) - max_range_start_pattern = re.compile(u'((?:{max_prefix_choices})\\s+({number}\\d+))'.format( + max_range_start_pattern = re.compile(r'((?:{max_prefix_choices})\s+({number}\d+__))'.format( number=numeral_constant.NUMBER_REPLACE_TEXT, max_prefix_choices=max_prefix_choices), re.UNICODE) number_range_matches = max_range_start_pattern.findall(self.processed_text) for match in number_range_matches: @@ -369,7 +374,7 @@ def _detect_max_num_range_with_suffix_variants(self, number_range_list=None, ori if self.max_range_suffix_variants: max_suffix_choices = '|'.join(self.max_range_suffix_variants) - max_range_end_pattern = re.compile(u'(({number}\\d+)\\s+(?:{max_suffix_choices}))'.format( + max_range_end_pattern = re.compile(r'(({number}\d+__)\s+(?:{max_suffix_choices}))'.format( number=numeral_constant.NUMBER_REPLACE_TEXT, max_suffix_choices=max_suffix_choices), re.UNICODE) number_range_matches = max_range_end_pattern.findall(self.processed_text) for match in number_range_matches: @@ -399,9 +404,9 @@ def _detect_min_max_num_range(self, number_range_list=None, original_list=None): if self.min_max_range_variants: min_max_choices = '|'.join(self.min_max_range_variants) - min_max_range_pattern = re.compile(u'(({number}\\d+)\\s*(?:{min_max_choices})\\s*' - u'({number}\\d+))'.format(number=numeral_constant.NUMBER_REPLACE_TEXT, - min_max_choices=min_max_choices), re.UNICODE) + min_max_range_pattern = re.compile(r'(({number}\d+__)\s*(?:{min_max_choices})\s*' + r'({number}\d+__))'.format(number=numeral_constant.NUMBER_REPLACE_TEXT, + min_max_choices=min_max_choices), re.UNICODE) number_range_matches = min_max_range_pattern.findall(self.processed_text) for match in number_range_matches: number_range, original_text = self._get_number_range(min_part_match=match[1], max_part_match=match[2], @@ -423,7 +428,7 @@ def _update_tagged_text(self, original_number_list): created from entity_name """ for detected_text in original_number_list: - _pattern = re.compile(u'\\b%s\\b' % re.escape(detected_text), flags=_re_flags) + _pattern = re.compile(r'\b%s\b' % re.escape(detected_text), flags=_re_flags) self.tagged_text = _pattern.sub(self.tag, self.tagged_text) diff --git a/ner_v2/detectors/temporal/time/en/time_detection.py b/ner_v2/detectors/temporal/time/en/time_detection.py index 684d8e254..77aadaa72 100644 --- a/ner_v2/detectors/temporal/time/en/time_detection.py +++ b/ner_v2/detectors/temporal/time/en/time_detection.py @@ -1246,7 +1246,7 @@ def _detect_24_hour_format(self, time_list=None, original_list=None): time_list = [] if original_list is None: original_list = [] - patterns = re.findall(r'\b(({timezone})?\s*(00?|0?[2-9]|0?1[0-9]?|2[0-3])[:.\s]?([0-5][0-9])\s*({timezone})?)' + patterns = re.findall(r'\b(({timezone})?\s*(00?|0?[2-9]|0?1[0-9]?|2[0-3])[:.\s]([0-5][0-9])\s*({timezone})?)' r'(?!\s*(?:am|pm|a\.m\.?|p\.m\.?|(?:{timezone})|\d))' .format(timezone=self.timezone_choices), self.processed_text.lower()) @@ -1364,11 +1364,10 @@ def _detect_time_without_format_preceeding(self, time_list=None, original_list=N r'({timezone})?)\b'.format(timezone=self.timezone_choices), self.processed_text.lower()) - if not patterns and self.bot_message: - if re.findall(r"Time|time", self.bot_message.lower()): - patterns = re.findall(r'\b(({timezone})?\s*([0-2]?[0-9])' - r'()\s*({timezone})?)\b'.format(timezone=self.timezone_choices), - self.processed_text.lower()) + if not patterns and self.bot_message and re.findall(r"Time|time", self.bot_message.lower()): + patterns = re.findall(r'\b(({timezone})?\s*([0-2]?[0-9])' + r'()\s*({timezone})?)\b'.format(timezone=self.timezone_choices), + self.processed_text.lower()) for pattern in patterns: original = pattern[0].strip() t1 = pattern[2] @@ -1423,9 +1422,7 @@ def _get_meridiem(self, hours, mins, timezone): return 'hrs' if current_hour >= TWELVE_HOUR: current_hour -= 12 - if current_hour < hours: - return PM_MERIDIEM - elif current_hour == hours and current_min < mins: + if (current_hour < hours) or (current_hour == hours and current_min < mins): return PM_MERIDIEM else: if current_hour > hours: @@ -1739,10 +1736,7 @@ def _remove_time_range_entities(self, time_list, original_list): time_list_final = [] original_list_final = [] for i, entity in enumerate(time_list): - if 'range' not in entity: - time_list_final.append(entity) - original_list_final.append(original_list[i]) - elif not entity['range']: + if not entity.get('range'): time_list_final.append(entity) original_list_final.append(original_list[i]) return time_list_final, original_list_final diff --git a/ner_v2/tests/numeral/number/en/number_ner_tests.yaml b/ner_v2/tests/numeral/number/en/number_ner_tests.yaml index 4e6e4bf43..24cfa727c 100644 --- a/ner_v2/tests/numeral/number/en/number_ner_tests.yaml +++ b/ner_v2/tests/numeral/number/en/number_ner_tests.yaml @@ -5,7 +5,7 @@ tests: unit_type: null outputs: - output_id: 1 - value: 100 + value: "100" unit: null original_text: "100" - id: en_2 @@ -13,7 +13,7 @@ tests: unit_type: currency outputs: - output_id: 1 - value: 100 + value: "100" unit: rupees original_text: "rs.100" - id: en_3 @@ -21,7 +21,7 @@ tests: unit_type: null outputs: - output_id: 1 - value: 11.2 + value: "11.2" unit: null original_text: "11.2" - id: en_4 @@ -29,7 +29,7 @@ tests: unit_type: currency outputs: - output_id: 1 - value: 10120 + value: "10120" unit: rupees original_text: "10.12k rupees" - id: en_5 @@ -37,7 +37,7 @@ tests: unit_type: null outputs: - output_id: 1 - value: 1000 + value: "1000" unit: null original_text: "1 thousand" - id: en_6 @@ -45,7 +45,7 @@ tests: unit_type: currency outputs: - output_id: 1 - value: 1000 + value: "1000" unit: rupees original_text: "1 thousand rupees" - id: en_7 @@ -53,7 +53,7 @@ tests: unit_type: null outputs: - output_id: 1 - value: 2200 + value: "2200" unit: null original_text: "2.2k" - id: en_8 @@ -61,7 +61,7 @@ tests: unit_type: currency outputs: - output_id: 1 - value: 2300 + value: "2300" unit: rupees original_text: "2.3k rupees" - id: en_9 @@ -72,3 +72,107 @@ tests: value: null unit: null original_text: null + - id: en_10 + message: "my alpha numeric code is 123ABC678DEF012, got it?" + unit_type: null + outputs: + - output_id: 1 + value: null + unit: null + original_text: null + - id: en_11 + message: "my alpha numeric code is 123 ABC 678 DEF 012, got it?" + unit_type: null + outputs: + - output_id: 1 + value: "123" + unit: null + original_text: "123" + - output_id: 2 + value: "678" + unit: null + original_text: "678" + - output_id: 3 + value: "12" + unit: null + original_text: "012" + - id: en_12 + message: "My name is Chirag Jain. The date is 28th Feb, 28/02/2021 next Monday 9:30 pm. This morning next weekday, tomorrow evening. emails are jain@abc.com chirag@example.com. yes no 1 2 3 12 123 1234 12345 123456 1234567890 918097678009 ALWPG5809L. My number is +911234567890. other number is 7123456789. The city is Mumbai, Maharashtra and Lucknow Delhi. jio phone more Rs. 10. 500 - 1000 rupees. less than 50000 rupees and 3 children. prod01 pillows https://haptik.ai a-b 1-2 a ab active" + unit_type: null + outputs: + - original_text: "1" + output_id: 1 + unit: null + value: "1" + - original_text: "2" + output_id: 2 + unit: null + value: "2" + - original_text: "3" + output_id: 3 + unit: null + value: "3" + - original_text: "12" + output_id: 4 + unit: null + value: "12" + - original_text: "123" + output_id: 5 + unit: null + value: "123" + - original_text: "1234" + output_id: 6 + unit: null + value: "1234" + - original_text: "12345" + output_id: 7 + unit: null + value: "12345" + - original_text: "123456" + output_id: 8 + unit: null + value: "123456" + - original_text: "10" + output_id: 9 + unit: null + value: "10" + - original_text: "500" + output_id: 10 + unit: null + value: "500" + - original_text: "1000" + output_id: 11 + unit: null + value: "1000" + - original_text: "50000" + output_id: 12 + unit: null + value: "50000" + - original_text: "3" + output_id: 13 + unit: null + value: "3" + - original_text: "1" + output_id: 14 + unit: null + value: "1" + - original_text: "2" + output_id: 15 + unit: null + value: "2" + - id: en_13 + message: "My name is Chirag Jain. The date is 28th Feb, 28/02/2021 next Monday 9:30 pm. This morning next weekday, tomorrow evening. emails are jain@abc.com chirag@example.com. yes no 1 2 3 12 123 1234 12345 123456 1234567890 918097678009 ALWPG5809L. My number is +911234567890. other number is 7123456789. The city is Mumbai, Maharashtra and Lucknow Delhi. jio phone more Rs. 10. 500 - 1000 rupees. less than 50000 rupees and 3 children. prod01 pillows https://haptik.ai a-b 1-2 a ab active" + unit_type: currency + outputs: + - original_text: "rs. 10" + output_id: 1 + unit: rupees + value: "10" + - original_text: "1000 rupees" + output_id: 2 + unit: rupees + value: "1000" + - original_text: "50000 rupees" + output_id: 3 + unit: rupees + value: "50000" \ No newline at end of file diff --git a/ner_v2/tests/numeral/number/en/test_number_detection.py b/ner_v2/tests/numeral/number/en/test_number_detection.py index 8511405c0..bcd834583 100644 --- a/ner_v2/tests/numeral/number/en/test_number_detection.py +++ b/ner_v2/tests/numeral/number/en/test_number_detection.py @@ -147,6 +147,9 @@ class NumberDetectorTestMeta(type): def __new__(cls, name, bases, attrs): for test_name, test_fn in cls.yaml_testsuite_generator(): + if test_name in attrs: + raise ValueError('Got duplicate test name {test_name}, please make sure all tests have unique "id"' + .format(test_name=test_name)) attrs[test_name] = test_fn return super(NumberDetectorTestMeta, cls).__new__(cls, name, bases, attrs) diff --git a/ner_v2/tests/numeral/number_range/number_range_ner_tests.yaml b/ner_v2/tests/numeral/number_range/number_range_ner_tests.yaml index 5d7058747..3a749ae45 100644 --- a/ner_v2/tests/numeral/number_range/number_range_ner_tests.yaml +++ b/ner_v2/tests/numeral/number_range/number_range_ner_tests.yaml @@ -4,7 +4,7 @@ tests: message: "I want more than 200 banana" outputs: - max_value: null - min_value: '200' + min_value: "200" original_text: "more than 200" output_id: 1 unit: null @@ -14,7 +14,7 @@ tests: message: "My monthly salary will be more than 2k per month" outputs: - max_value: null - min_value: '2000' + min_value: "2000" original_text: "more than 2k" output_id: 1 unit: null @@ -24,7 +24,7 @@ tests: message: "more than 2.5k people in the stadium" outputs: - max_value: null - min_value: '2500' + min_value: "2500" original_text: "more than 2.5k" output_id: 1 unit: null @@ -64,7 +64,7 @@ tests: message: "more than 200 rupees" outputs: - max_value: null - min_value: '200' + min_value: "200" original_text: "more than 200 rupees" output_id: 1 unit: rupees @@ -74,7 +74,7 @@ tests: message: "more than 2k rupees" outputs: - max_value: null - min_value: '2000' + min_value: "2000" original_text: "more than 2k rupees" output_id: 1 unit: rupees @@ -84,7 +84,7 @@ tests: message: "more than 2.5k rupees" outputs: - max_value: null - min_value: '2500' + min_value: "2500" original_text: "more than 2.5k rupees" output_id: 1 unit: rupees @@ -93,8 +93,8 @@ tests: - id: en_10 message: "200 to 300" outputs: - - max_value: '300' - min_value: '200' + - max_value: "300" + min_value: "200" original_text: "200 to 300" output_id: 1 unit: null @@ -103,8 +103,8 @@ tests: - id: en_11 message: "200 - 300" outputs: - - max_value: '300' - min_value: '200' + - max_value: "300" + min_value: "200" original_text: "200 - 300" output_id: 1 unit: null @@ -113,8 +113,8 @@ tests: - id: en_12 message: "200-300" outputs: - - max_value: '300' - min_value: '200' + - max_value: "300" + min_value: "200" original_text: "200-300" output_id: 1 unit: null @@ -150,197 +150,341 @@ tests: unit: null abs_value: null unit_type: currency -# - id: en_16 -# message: "200 to 300 ruppes" -# outputs: -# - max_value: 300 -# min_value: 200 -# original_text: "200 to 300 ruppes" -# output_id: 1 -# unit: rupees -# unit_type: currency -# - id: en_17 -# message: "200 – 300 rupees" -# outputs: -# - max_value: 300 -# min_value: 200 -# original_text: "200 – 300 rupees" -# output_id: 1 -# unit: rupees -# unit_type: currency -# - id: en_18 -# message: "200-300 rupees" -# outputs: -# - max_value: 300 -# min_value: 200 -# original_text: "200-300 rupees" -# output_id: 1 -# unit: rupees -# unit_type: currency -# - id: en_19 -# message: "200 rupees to 300" -# outputs: -# - max_value: 300 -# min_value: 200 -# original_text: "200 rupees to 300" -# output_id: 1 -# unit: rupees -# unit_type: currency -# - id: en_20 -# message: "200 rupees to 300 rupees" -# outputs: -# - max_value: 300 -# min_value: 200 -# original_text: "200 rupees to 300 rupees" -# output_id: 1 -# unit: rupees -# unit_type: currency -# - id: en_21 -# message: "200 rupees – 300" -# outputs: -# - max_value: 300 -# min_value: 200 -# original_text: "200 rupees – 300" -# output_id: 1 -# unit: rupees -# unit_type: currency -# - id: en_22 -# message: "200 rupees – 300 rupees" -# outputs: -# - max_value: 300 -# min_value: 200 -# original_text: "200 rupees – 300 rupees" -# output_id: 1 -# unit: rupees -# unit_type: currency -# - id: en_23 -# message: "200-300 rupees" -# outputs: -# - max_value: 300 -# min_value: 200 -# original_text: "200-300 rupees" -# output_id: 1 -# unit: rupees -# unit_type: currency -# - id: en_24 -# message: "200k-300k men and around 400 women" -# outputs: -# - max_value: 300000 -# min_value: 200000 -# original_text: "200k-300k" -# output_id: 1 -# unit: null -# - max_value: 400 -# min_value: null -# original_text: "around 400" -# output_id: 2 -# unit: null -# unit_type: null -# - id: en_25 -# message: "200k-300k men and around 300k women" -# outputs: -# - max_value: 300000 -# min_value: 200000 -# original_text: "200k-300k" -# output_id: 1 -# unit: null -# - max_value: 300000 -# min_value: null -# original_text: "around 300k" -# output_id: 2 -# unit: null -# unit_type: null -# - id: en_26 -# message: "between 2000 and 3000" -# outputs: -# - max_value: 3000 -# min_value: 2000 -# original_text: "between 2000 and 3000" -# output_id: 1 -# unit: null -# unit_type: null + - id: en_16 + message: "200 to 300 rupees" + outputs: + - max_value: "300" + min_value: "200" + original_text: "200 to 300 rupees" + output_id: 1 + unit: rupees + abs_value: null + unit_type: currency + - id: en_17 + message: "200 - 300 rupees" + outputs: + - max_value: "300" + min_value: "200" + original_text: "200 - 300 rupees" + output_id: 1 + unit: rupees + abs_value: null + unit_type: currency + - id: en_18 + message: "200-300 rupees" + outputs: + - max_value: "300" + min_value: "200" + original_text: "200-300 rupees" + output_id: 1 + unit: rupees + abs_value: null + unit_type: currency + - id: en_19 + message: "200 rupees to 300" + outputs: + - max_value: "300" + min_value: "200" + original_text: "200 rupees to 300" + output_id: 1 + unit: rupees + abs_value: null + unit_type: currency + - id: en_20 + message: "200 rupees to 300 rupees" + outputs: + - max_value: "300" + min_value: "200" + original_text: "200 rupees to 300 rupees" + output_id: 1 + unit: rupees + abs_value: null + unit_type: currency + - id: en_21 + message: "200 rupees - 300" + outputs: + - max_value: "300" + min_value: "200" + original_text: "200 rupees - 300" + output_id: 1 + unit: rupees + abs_value: null + unit_type: currency + - id: en_22 + message: "200 rupees - 300 rupees" + outputs: + - max_value: "300" + min_value: "200" + original_text: "200 rupees - 300 rupees" + output_id: 1 + unit: rupees + abs_value: null + unit_type: currency + - id: en_23 + message: "200-300 rupees" + outputs: + - max_value: "300" + min_value: "200" + original_text: "200-300 rupees" + output_id: 1 + unit: rupees + abs_value: null + unit_type: currency + - id: en_24 + message: "200k-300k men and around 400 women" + outputs: + - max_value: "300000" + min_value: "200000" + original_text: "200k-300k" + output_id: 1 + unit: null + abs_value: null + - max_value: "400" + min_value: null + original_text: "around 400" + output_id: 2 + unit: null + abs_value: null + unit_type: null + - id: en_25 + message: "200k-300k men and around 300k women" + outputs: + - max_value: "300000" + min_value: "200000" + original_text: "200k-300k" + output_id: 1 + unit: null + abs_value: null + - max_value: "300000" + min_value: null + original_text: "around 300k" + output_id: 2 + unit: null + abs_value: null + unit_type: null + - id: en_26 + message: "between 2000 and 3000" + outputs: + - max_value: "3000" + min_value: "2000" + original_text: "between 2000 and 3000" + output_id: 1 + unit: null + abs_value: null + unit_type: null + - id: en_27 + message: "My name is Chirag Jain. The date is 28th Feb, 28/02/2021 next Monday 9:30 pm. This morning next weekday, tomorrow evening. emails are jain@abc.com chirag@example.com. yes no 1 2 3 12 123 1234 12345 123456 1234567890 918097678009 ALWPG5809L. My number is +911234567890. other number is 7123456789. The city is Mumbai, Maharashtra and Lucknow Delhi. jio phone more Rs. 10. 500 - 1000 rupees. less than 50000 rupees and 3 children. prod01 pillows https://haptik.ai a-b 1-2 a ab active" + unit_type: null + outputs: + - abs_value: null + max_value: "1000" + min_value: "500" + original_text: "500 - 1000" + output_id: 1 + unit: null + - abs_value: null + max_value: "2" + min_value: "1" + original_text: "1-2" + output_id: 2 + unit: null + - abs_value: null + max_value: "50000" + min_value: null + original_text: "less than 50000" + output_id: 3 + unit: null + - abs_value: "1" + max_value: null + min_value: null + original_text: "1" + output_id: 4 + unit: null + - abs_value: "2" + max_value: null + min_value: null + original_text: "2" + output_id: 5 + unit: null + - abs_value: "3" + max_value: null + min_value: null + original_text: "3" + output_id: 6 + unit: null + - abs_value: "12" + max_value: null + min_value: null + original_text: "12" + output_id: 7 + unit: null + - abs_value: "123" + max_value: null + min_value: null + original_text: "123" + output_id: 8 + unit: null + - abs_value: "1234" + max_value: null + min_value: null + original_text: "1234" + output_id: 9 + unit: null + - abs_value: "12345" + max_value: null + min_value: null + original_text: "12345" + output_id: 10 + unit: null + - abs_value: "123456" + max_value: null + min_value: null + original_text: "123456" + output_id: 11 + unit: null + - abs_value: "1234567890" + max_value: null + min_value: null + original_text: "1234567890" + output_id: 12 + unit: null + - abs_value: "918097678009" + max_value: null + min_value: null + original_text: "918097678009" + output_id: 13 + unit: null + - abs_value: "7123456789" + max_value: null + min_value: null + original_text: "7123456789" + output_id: 14 + unit: null + - abs_value: "10" + max_value: null + min_value: null + original_text: "10" + output_id: 15 + unit: null + - abs_value: "3" + max_value: null + min_value: null + original_text: "3" + output_id: 16 + unit: null + - id: en_28 + message: "My name is Chirag Jain. The date is 28th Feb, 28/02/2021 next Monday 9:30 pm. This morning next weekday, tomorrow evening. emails are jain@abc.com chirag@example.com. yes no 1 2 3 12 123 1234 12345 123456 1234567890 918097678009 ALWPG5809L. My number is +911234567890. other number is 7123456789. The city is Mumbai, Maharashtra and Lucknow Delhi. jio phone more Rs. 10. 500 - 1000 rupees. less than 50000 rupees and 3 children. prod01 pillows https://haptik.ai a-b 1-2 a ab active" + unit_type: currency + outputs: + - abs_value: null + max_value: "1000" + min_value: "500" + original_text: "500 - 1000 rupees" + output_id: 1 + unit: rupees + - abs_value: null + max_value: "50000" + min_value: null + original_text: "less than 50000 rupees" + output_id: 2 + unit: rupees + - abs_value: "10" + max_value: null + min_value: null + original_text: "rs. 10" + output_id: 3 + unit: rupees hi: -# - id: hi_1 -# message: "200 se jyada" -# outputs: -# - max_value: null -# min_value: 200 -# original_text: "200 se jyada" -# output_id: 1 -# unit: null -# unit_type: null -# - id: hi_2 -# message: "2k se upar" -# outputs: -# - max_value: null -# min_value: 2000 -# original_text: "2k se upar" -# output_id: 1 -# unit: null -# unit_type: null -# - id: hi_3 -# message: "jada se jada 2500" -# outputs: -# - max_value: 2500 -# min_value: null -# original_text: "jada se jada 2500" -# output_id: 1 -# unit: null -# unit_type: null -# - id: hi_7 -# message: "200 rupees se jyada" -# outputs: -# - max_value: null -# min_value: 200 -# original_text: "200 rupees se jyada" -# output_id: 1 -# unit: rupees -# unit_type: currency -# - id: hi_8 -# message: "Rupees 2000 se upar" -# outputs: -# - max_value: null -# min_value: 2000 -# original_text: "Rupees 2000 se upar" -# output_id: 1 -# unit: rupees -# unit_type: currency -# - id: hi_9 -# message: "jada se jada 2500 rupees" -# outputs: -# - max_value: 2500 -# min_value: null -# original_text: "jada se jada 2500 rupees" -# output_id: 1 -# unit: rupees -# unit_type: currency -# - id: hi_10 -# message: "200 se 300" -# outputs: -# - max_value: 300 -# min_value: 200 -# original_text: "200 se 300" -# output_id: 1 -# unit: null -# unit_type: null -# - id: hi_11 -# message: "200 – 300" -# outputs: -# - max_value: 300 -# min_value: 200 -# original_text: "200 – 300" -# output_id: 1 -# unit: null -# unit_type: null -# - id: hi_12 -# message: "200-300" -# outputs: -# - max_value: 300 -# min_value: 200 -# original_text: "200-300" -# output_id: 1 -# unit: null -# unit_type: null + - id: hi_1 + message: "200 se jyada" + outputs: + - max_value: null + min_value: "200" + original_text: "200 se jyada" + output_id: 1 + unit: null + abs_value: null + unit_type: null + - id: hi_2 + message: "2k se upar" + outputs: + - max_value: null + min_value: "2000" + original_text: "2k se upar" + output_id: 1 + unit: null + abs_value: null + unit_type: null + - id: hi_3 + message: "jada se jada 2500" + outputs: + - max_value: "2500" + min_value: null + original_text: "jada se jada 2500" + output_id: 1 + unit: null + abs_value: null + unit_type: null + - id: hi_7 + message: "200 rupees se jyada" + outputs: + - max_value: null + min_value: "200" + original_text: "200 rupees se jyada" + output_id: 1 + unit: rupees + abs_value: null + unit_type: currency + - id: hi_8 + message: "Rupees 2000 se upar" + outputs: + - max_value: null + min_value: "2000" + original_text: "Rupees 2000 se upar" + output_id: 1 + unit: rupees + abs_value: null + unit_type: currency + - id: hi_9 + message: "jada se jada 2500 rupees" + outputs: + - max_value: "2500" + min_value: null + original_text: "jada se jada 2500 rupees" + output_id: 1 + unit: rupees + abs_value: null + unit_type: currency + - id: hi_10 + message: "200 se 300" + outputs: + - max_value: "300" + min_value: "200" + original_text: "200 se 300" + output_id: 1 + unit: null + abs_value: null + unit_type: null + - id: hi_11 + message: "200 - 300" + outputs: + - max_value: "300" + min_value: "200" + original_text: "200 - 300" + output_id: 1 + unit: null + abs_value: null + unit_type: null + - id: hi_12 + message: "200-300" + outputs: + - max_value: "300" + min_value: "200" + original_text: "200-300" + output_id: 1 + unit: null + abs_value: null + unit_type: null - id: hi_13 message: "200 se 300" outputs: @@ -371,105 +515,116 @@ tests: unit: null abs_value: null unit_type: currency -# - id: hi_16 -# message: "200 se 300 rupees" -# outputs: -# - max_value: 300 -# min_value: 200 -# original_text: "200 se 300 rupees" -# output_id: 1 -# unit: rupees -# unit_type: currency -# - id: hi_17 -# message: "200 – 300 rupees" -# outputs: -# - max_value: 300 -# min_value: 200 -# original_text: "200 – 300 rupees" -# output_id: 1 -# unit: rupees -# unit_type: currency -# - id: hi_18 -# message: "200-300 rupees" -# outputs: -# - max_value: 300 -# min_value: 200 -# original_text: "200-300 rupees" -# output_id: 1 -# unit: rupees -# unit_type: currency -# - id: hi_19 -# message: "200 rupees se 300" -# outputs: -# - max_value: 300 -# min_value: 200 -# original_text: "200 rupees se 300" -# output_id: 1 -# unit: rupees -# unit_type: currency -# - id: hi_20 -# message: "200 rupees se 300 rupees" -# outputs: -# - max_value: 300 -# min_value: 200 -# original_text: "200 rupees se 300 rupees" -# output_id: 1 -# unit: rupees -# unit_type: currency -# - id: hi_21 -# message: "200 rupees – 300" -# outputs: -# - max_value: 300 -# min_value: 200 -# original_text: "200 rupees – 300" -# output_id: 1 -# unit: rupees -# unit_type: currency -# - id: hi_22 -# message: "200 rupees – 300 rupees" -# outputs: -# - max_value: 300 -# min_value: 200 -# original_text: "200 rupees – 300 rupees" -# output_id: 1 -# unit: rupees -# unit_type: currency -# - id: hi_23 -# message: "200-300 rupees" -# outputs: -# - max_value: 300 -# min_value: 200 -# original_text: "200-300 rupees" -# output_id: 1 -# unit: rupees -# unit_type: currency -# - id: hi_24 -# message: "२०० से ज्यादा" -# outputs: -# - max_value: null -# min_value: 200 -# original_text: "२०० से ज्यादा" -# output_id: 1 -# unit: null -# unit_type: null -# - id: hi_25 -# message: "२ हजार से ऊपर" -# outputs: -# - max_value: null -# min_value: 2000 -# original_text: "२ हजार से ऊपर" -# output_id: 1 -# unit: null -# unit_type: null -# - id: hi_26 -# message: "ज्यादा से ज्यादा २ हजार" -# outputs: -# - max_value: 2000 -# min_value: null -# original_text: "ज्यादा से ज्यादा २ हजार" -# output_id: 1 -# unit: null -# unit_type: null + - id: hi_16 + message: "200 se 300 rupees" + outputs: + - max_value: "300" + min_value: "200" + original_text: "200 se 300 rupees" + output_id: 1 + unit: rupees + abs_value: null + unit_type: currency + - id: hi_17 + message: "200 - 300 rupees" + outputs: + - max_value: "300" + min_value: "200" + original_text: "200 - 300 rupees" + output_id: 1 + unit: rupees + abs_value: null + unit_type: currency + - id: hi_18 + message: "200-300 rupees" + outputs: + - max_value: "300" + min_value: "200" + original_text: "200-300 rupees" + output_id: 1 + unit: rupees + abs_value: null + unit_type: currency + - id: hi_19 + message: "200 rupees se 300" + outputs: + - max_value: "300" + min_value: "200" + original_text: "200 rupees se 300" + output_id: 1 + unit: rupees + abs_value: null + unit_type: currency + - id: hi_20 + message: "200 rupees se 300 rupees" + outputs: + - max_value: "300" + min_value: "200" + original_text: "200 rupees se 300 rupees" + output_id: 1 + unit: rupees + abs_value: null + unit_type: currency + - id: hi_21 + message: "200 rupees - 300" + outputs: + - max_value: "300" + min_value: "200" + original_text: "200 rupees - 300" + output_id: 1 + unit: rupees + abs_value: null + unit_type: currency + - id: hi_22 + message: "200 rupees - 300 rupees" + outputs: + - max_value: "300" + min_value: "200" + original_text: "200 rupees - 300 rupees" + output_id: 1 + unit: rupees + abs_value: null + unit_type: currency + - id: hi_23 + message: "200-300 rupees" + outputs: + - max_value: "300" + min_value: "200" + original_text: "200-300 rupees" + output_id: 1 + unit: rupees + abs_value: null + unit_type: currency + - id: hi_24 + message: "२०० से ज्यादा" + outputs: + - max_value: null + min_value: "200" + original_text: "२०० से ज्यादा" + output_id: 1 + unit: null + abs_value: null + unit_type: null + - id: hi_25 + message: "२ हजार से ऊपर" + outputs: + - max_value: null + min_value: "2000" + original_text: "२ हजार से ऊपर" + output_id: 1 + unit: null + abs_value: null + unit_type: null + - id: hi_26 + message: "ज्यादा से ज्यादा २ हजार" + outputs: + - max_value: "2000" + min_value: null + original_text: "ज्यादा से ज्यादा २ हजार" + output_id: 1 + unit: null + abs_value: null + unit_type: null - id: hi_27 message: "२०० से ज्यादा" outputs: @@ -500,60 +655,66 @@ tests: unit: null abs_value: null unit_type: currency -# - id: hi_30 -# message: "२०० रूपीस से ज्यादा" -# outputs: -# - max_value: null -# min_value: 200 -# original_text: "२०० रूपीस से ज्यादा" -# output_id: 1 -# unit: rupees -# unit_type: currency -# - id: hi_31 -# message: "रूपीस २ हजार से ऊपर" -# outputs: -# - max_value: null -# min_value: 2000 -# original_text: रूपीस २ हजार से ऊपर -# output_id: 1 -# unit: rupees -# unit_type: currency -# - id: hi_32 -# message: "ज्यादा से ज्यादा ५ हजार रुपया" -# outputs: -# - max_value: 5000 -# min_value: null -# original_text: ज्यादा से ज्यादा ५ हजार रुपया -# output_id: 1 -# unit: rupees -# unit_type: currency -# - id: hi_33 -# message: "२०० से ३००" -# outputs: -# - max_value: 300 -# min_value: 200 -# original_text: २०० से ३०० -# output_id: 1 -# unit: null -# unit_type: null -# - id: hi_34 -# message: "२०० – ३००" -# outputs: -# - max_value: 300 -# min_value: 200 -# original_text: "२०० – ३००" -# output_id: 1 -# unit: null -# unit_type: null -# - id: hi_35 -# message: "२००-३००" -# outputs: -# - max_value: 300 -# min_value: 200 -# original_text: "२००-३००" -# output_id: 1 -# unit: null -# unit_type: null + - id: hi_30 + message: "२०० रूपीस से ज्यादा" + outputs: + - max_value: null + min_value: "200" + original_text: "२०० रूपीस से ज्यादा" + output_id: 1 + unit: rupees + abs_value: null + unit_type: currency + - id: hi_31 + message: "रूपीस २ हजार से ऊपर" + outputs: + - max_value: null + min_value: "2000" + original_text: रूपीस २ हजार से ऊपर + output_id: 1 + unit: rupees + abs_value: null + unit_type: currency + - id: hi_32 + message: "ज्यादा से ज्यादा ५ हजार रुपया" + outputs: + - max_value: "5000" + min_value: null + original_text: ज्यादा से ज्यादा ५ हजार रुपया + output_id: 1 + unit: rupees + abs_value: null + unit_type: currency + - id: hi_33 + message: "२०० से ३००" + outputs: + - max_value: "300" + min_value: "200" + original_text: २०० से ३०० + output_id: 1 + unit: null + abs_value: null + unit_type: null + - id: hi_34 + message: "२०० - ३००" + outputs: + - max_value: "300" + min_value: "200" + original_text: "२०० - ३००" + output_id: 1 + unit: null + abs_value: null + unit_type: null + - id: hi_35 + message: "२००-३००" + outputs: + - max_value: "300" + min_value: "200" + original_text: "२००-३००" + output_id: 1 + unit: null + abs_value: null + unit_type: null - id: hi_36 message: "२०० से ३००" outputs: @@ -584,75 +745,83 @@ tests: unit: null abs_value: null unit_type: currency -# - id: hi_39 -# message: "२०० से ३०० रुपया" -# outputs: -# - max_value: 300 -# min_value: 200 -# original_text: "२०० से ३०० रुपया" -# output_id: 1 -# unit: rupees -# unit_type: currency -# - id: hi_40 -# message: "२००-३०० रुपया" -# outputs: -# - max_value: 300 -# min_value: 200 -# original_text: "२००-३०० रुपया" -# output_id: 1 -# unit: rupees -# unit_type: currency -# - id: hi_41 -# message: "२०० रुपया से ३००" -# outputs: -# - max_value: 300 -# min_value: 200 -# original_text: "२०० रुपया से ३००" -# output_id: 1 -# unit: rupees -# unit_type: currency -# - id: hi_42 -# message: "२०० रुपया से ३०० रुपया" -# outputs: -# - max_value: 300 -# min_value: 200 -# original_text: "२०० रुपया से ३०० रुपया" -# output_id: 1 -# unit: rupees -# unit_type: currency -# - id: hi_43 -# message: "२०० - ३०० रुपया" -# outputs: -# - max_value: 300 -# min_value: 200 -# original_text: "२०० - ३०० रुपया" -# output_id: 1 -# unit: rupees -# unit_type: currency -# - id: hi_44 -# message: "२०० रुपया - ३००" -# outputs: -# - max_value: 300 -# min_value: 200 -# original_text: "२०० रुपया - ३००" -# output_id: 1 -# unit: rupees -# unit_type: currency -# - id: hi_45 -# message: "२०० रुपया - ३०० रुपया " -# outputs: -# - max_value: 300 -# min_value: 200 -# original_text: "२०० रुपया - ३०० रुपया" -# output_id: 1 -# unit: rupees -# unit_type: currency -# - id: hi_46 -# message: "२००-३०० रुपया" -# outputs: -# - max_value: 300 -# min_value: 200 -# original_text: "२००-३०० रुपया" -# output_id: 1 -# unit: rupees -# unit_type: currency + - id: hi_39 + message: "२०० से ३०० रुपया" + outputs: + - max_value: "300" + min_value: "200" + original_text: "२०० से ३०० रुपया" + output_id: 1 + unit: rupees + abs_value: null + unit_type: currency + - id: hi_40 + message: "२००-३०० रुपया" + outputs: + - max_value: "300" + min_value: "200" + original_text: "२००-३०० रुपया" + output_id: 1 + unit: rupees + abs_value: null + unit_type: currency + - id: hi_41 + message: "२०० रुपया से ३००" + outputs: + - max_value: "300" + min_value: "200" + original_text: "२०० रुपया से ३००" + output_id: 1 + unit: rupees + abs_value: null + unit_type: currency + - id: hi_42 + message: "२०० रुपया से ३०० रुपया" + outputs: + - max_value: "300" + min_value: "200" + original_text: "२०० रुपया से ३०० रुपया" + output_id: 1 + unit: rupees + abs_value: null + unit_type: currency + - id: hi_43 + message: "२०० - ३०० रुपया" + outputs: + - max_value: "300" + min_value: "200" + original_text: "२०० - ३०० रुपया" + output_id: 1 + unit: rupees + abs_value: null + unit_type: currency + - id: hi_44 + message: "२०० रुपया - ३००" + outputs: + - max_value: "300" + min_value: "200" + original_text: "२०० रुपया - ३००" + output_id: 1 + unit: rupees + abs_value: null + unit_type: currency + - id: hi_45 + message: "२०० रुपया - ३०० रुपया " + outputs: + - max_value: "300" + min_value: "200" + original_text: "२०० रुपया - ३०० रुपया" + output_id: 1 + unit: rupees + abs_value: null + unit_type: currency + - id: hi_46 + message: "२००-३०० रुपया" + outputs: + - max_value: "300" + min_value: "200" + original_text: "२००-३०० रुपया" + output_id: 1 + unit: rupees + abs_value: null + unit_type: currency diff --git a/ner_v2/tests/numeral/number_range/test_number_range_detection.py b/ner_v2/tests/numeral/number_range/test_number_range_detection.py index 4c25a5059..a05c4dcc1 100644 --- a/ner_v2/tests/numeral/number_range/test_number_range_detection.py +++ b/ner_v2/tests/numeral/number_range/test_number_range_detection.py @@ -15,6 +15,9 @@ class NumberRangeDetectorTestMeta(type): def __new__(cls, name, bases, attrs): for test_name, test_fn in cls.yaml_testsuite_generator(): + if test_name in attrs: + raise ValueError('Got duplicate test name {test_name}, please make sure all tests have unique "id"' + .format(test_name=test_name)) attrs[test_name] = test_fn return super(NumberRangeDetectorTestMeta, cls).__new__(cls, name, bases, attrs) diff --git a/ner_v2/tests/pattern/phone_number/test_phone_number_detection.py b/ner_v2/tests/pattern/phone_number/test_phone_number_detection.py index 863243dcf..25b9c9c84 100644 --- a/ner_v2/tests/pattern/phone_number/test_phone_number_detection.py +++ b/ner_v2/tests/pattern/phone_number/test_phone_number_detection.py @@ -15,6 +15,9 @@ class PhoneNumberDetectorTestMeta(type): def __new__(cls, name, bases, attrs): for test_name, test_fn in cls.yaml_testsuite_generator(): + if test_name in attrs: + raise ValueError('Got duplicate test name {test_name}, please make sure all tests have unique "id"' + .format(test_name=test_name)) attrs[test_name] = test_fn return super(PhoneNumberDetectorTestMeta, cls).__new__(cls, name, bases, attrs) diff --git a/ner_v2/tests/temporal/time/test_time_detection.py b/ner_v2/tests/temporal/time/test_time_detection.py index 254619141..70b6bea3b 100644 --- a/ner_v2/tests/temporal/time/test_time_detection.py +++ b/ner_v2/tests/temporal/time/test_time_detection.py @@ -18,6 +18,9 @@ class TimeDetectionTestMeta(type): def __new__(cls, name, bases, attrs): for test_name, test_fn in cls.yaml_testsuite_generator(): + if test_name in attrs: + raise ValueError('Got duplicate test name {test_name}, please make sure all tests have unique "id"' + .format(test_name=test_name)) attrs[test_name] = test_fn return super(TimeDetectionTestMeta, cls).__new__(cls, name, bases, attrs) @@ -45,7 +48,7 @@ def parse_expected_outputs(expected_outputs): "hh": expected_output["hh"], "mm": expected_output["mm"], "nn": expected_output["nn"], - 'tz': expected_output["tz"], + "tz": expected_output["tz"], "range": expected_output["range"], "time_type": expected_output["time_type"] } diff --git a/ner_v2/tests/temporal/time/time_ner_tests.yaml b/ner_v2/tests/temporal/time/time_ner_tests.yaml index 6c84a7282..557c5bab7 100644 --- a/ner_v2/tests/temporal/time/time_ner_tests.yaml +++ b/ner_v2/tests/temporal/time/time_ner_tests.yaml @@ -804,6 +804,19 @@ tests: range: null time_type: null original_text: "once in 3 days" + - id: en_62 + message: "your three digit code is 145 and 4 digit code is 1230. Please enter this on the app. 12 is the two digit code" + bot_message: null + range_enabled: false + outputs: + - hh: null + mm: null + nn: null + tz: null + original_text: null + output_id: 1 + range: null + time_type: null hi: - id: hi_1 diff --git a/postman_tests/data/entities/numberV2.json b/postman_tests/data/entities/numberV2.json index a65496b04..4e581df4d 100644 --- a/postman_tests/data/entities/numberV2.json +++ b/postman_tests/data/entities/numberV2.json @@ -429,8 +429,7 @@ }, "expected": [ { - "original_text": "6754321", - "value": "6754321" + "data": null } ] }, diff --git a/requirements.txt b/requirements.txt index 13ecaa3bb..ac005bf35 100644 --- a/requirements.txt +++ b/requirements.txt @@ -11,24 +11,25 @@ Django==1.11.29 django-dotenv==1.4.2 weighted-levenshtein==0.1 regex==2018.7.11 -ipython word2number==1.1 python-crfsuite==0.9.6 boto==2.49.0 boto3==1.8.4 python-dateutil==2.7.3 pandas==0.19.0 -mock==2.0.0 -django-nose==1.4.5 -typing==3.6.2 -flake8==3.4.1 pyaml==19.4.1 -coverage==4.5.3 -nose-exclude==0.5.0 spacy==2.3.2 https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.3.1/en_core_web_sm-2.3.1.tar.gz#egg=en_core_web_sm https://github.com/explosion/spacy-models/releases/download/nl_core_news_sm-2.3.0/nl_core_news_sm-2.3.0.tar.gz#egg=nl_core_news_sm https://github.com/explosion/spacy-models/releases/download/fr_core_news_sm-2.3.0/fr_core_news_sm-2.3.0.tar.gz#egg=fr_core_news_sm https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-2.3.0/de_core_news_sm-2.3.0.tar.gz#egg=de_core_news_sm https://github.com/explosion/spacy-models/releases/download/es_core_news_sm-2.3.1/es_core_news_sm-2.3.1.tar.gz#egg=es_core_news_sm -sentry-sdk==0.14.1 +typing==3.6.2 +flake8==3.4.1 +mock==2.0.0 +coverage==5.5 +nose-exclude==0.5.0 +django-nose==1.4.7 +sentry-sdk==0.20.3 +jedi==0.17.2 +ipython==7.16.1