diff --git a/.coveragerc b/.coveragerc
new file mode 100644
index 000000000..fa58ffe57
--- /dev/null
+++ b/.coveragerc
@@ -0,0 +1,41 @@
+[run]
+source = .
+omit =
+ *.pyc
+ *.pyo
+ */site-packages/*
+ */distutils/*
+ docs/*
+ docker/*
+ logs/*
+ postman_tests/*
+ */tests/*
+ */test.py
+ */tests.py
+ manage.py
+ */settings.py
+ */urls.py
+ */migrations/*
+ *wsgi.py
+ *__init__.py
+
+[report]
+skip_empty = True
+sort = Cover
+exclude_lines =
+ pragma: no cover
+
+ # Don't complain about missing debug-only code:
+ def __repr__
+ if self\.debug
+
+ # Don't complain if tests don't hit defensive assertion code:
+ raise AssertionError
+ raise NotImplementedError
+
+ # Don't complain if non-runnable code isn't run:
+ if 0:
+ if __name__ == .__main__.:
+
+ __author__ = 'haptik'
+show_missing = True
\ No newline at end of file
diff --git a/.gitignore b/.gitignore
index 89c33629c..4d72385b5 100644
--- a/.gitignore
+++ b/.gitignore
@@ -103,7 +103,7 @@ ENV/
/newrelic.ini
sftp-config.json
.DS_Store
-logs/
+logs/*.log*
.vscode
newman_reports/
diff --git a/chatbot_ner/config.py b/chatbot_ner/config.py
index 0720b40c8..ffd1820fb 100644
--- a/chatbot_ner/config.py
+++ b/chatbot_ner/config.py
@@ -13,14 +13,10 @@
LOG_PATH = os.path.join(BASE_DIR, 'logs')
# TODO: Set this up via Django LOGGING
-# SET UP NER LOGGING
-if not os.path.exists(LOG_PATH):
- os.makedirs(LOG_PATH)
-
LOG_LEVEL = os.environ.get('DJANGO_LOG_LEVEL', 'error').upper()
# Common formatter
-formatter = logging.Formatter("%(asctime)s\t%(levelname)s\t%(message)s", "%Y-%m-%d %H:%M:%S")
+formatter = logging.Formatter("%(asctime)s %(levelname)s %(message)s %(module)s:%(lineno)d")
# Handler for Docker stdout
handler_stdout = logging.StreamHandler()
@@ -29,28 +25,14 @@
# SETUP NER LOGGING
NER_LOG_FILENAME = os.path.join(LOG_PATH, 'ner_log.log')
-# Set up a specific logger with our desired output level
-ner_logger = logging.getLogger('NERLogger')
-ner_logger.setLevel(LOG_LEVEL)
-# Add the log message handler to the logger
handler = logging.handlers.WatchedFileHandler(NER_LOG_FILENAME)
-# handler = logging.handlers.RotatingFileHandler(NER_LOG_FILENAME, maxBytes=10 * 1024 * 1024, backupCount=5)
handler.setFormatter(formatter)
+
+ner_logger = logging.getLogger('NERLogger')
+ner_logger.setLevel(LOG_LEVEL)
ner_logger.addHandler(handler)
ner_logger.addHandler(handler_stdout)
-# SETUP NLP LIB LOGGING
-NLP_LIB_LOG_FILENAME = os.path.join(LOG_PATH, 'nlp_log.log')
-# Set up a specific logger with our desired output level
-nlp_logger = logging.getLogger('NLPLibLogger')
-nlp_logger.setLevel(LOG_LEVEL)
-# Add the log message handler to the logger
-handler = logging.handlers.WatchedFileHandler(NLP_LIB_LOG_FILENAME)
-# handler = logging.handlers.RotatingFileHandler(NLP_LIB_LOG_FILENAME, maxBytes=10 * 1024 * 1024, backupCount=5)
-handler.setFormatter(formatter)
-nlp_logger.addHandler(handler)
-nlp_logger.addHandler(handler_stdout)
-
ENGINE = os.environ.get('ENGINE')
# ES settings (Mandatory to use Text type entities)
ES_SCHEME = os.environ.get('ES_SCHEME', 'http')
diff --git a/chatbot_ner/settings.py b/chatbot_ner/settings.py
index 84b9ff273..f8d8c77af 100755
--- a/chatbot_ner/settings.py
+++ b/chatbot_ner/settings.py
@@ -10,6 +10,7 @@
# Build paths inside the project like this: os.path.join(BASE_DIR, ...)
from __future__ import absolute_import
+
import os
import sys
@@ -119,27 +120,25 @@ def __getitem__(self, item):
'CONN_MAX_AGE': 60
}
-# MIGRATION_MODULES = DisableMigrations()
-
-
TEST_RUNNER = 'django_nose.NoseTestSuiteRunner'
NOSE_ARGS = [
'--nocapture',
'--nologcapture',
'--verbosity=3',
- '--ignore-files=urls.py',
- '--ignore-files=wsgi.py',
+ '--exclude-dir=chatbot_ner/',
+ '--exclude-dir=docs/',
+ '--exclude-dir=docker/',
+ '--exclude-dir=data/',
'--ignore-files=manage.py',
'--ignore-files=nltk_setup.py',
'--ignore-files=__init__.py',
'--ignore-files=const.py',
'--ignore-files=constant.py',
'--ignore-files=constants.py',
- '--ignore-files=settings.py',
'--ignore-files=run_postman_tests.py',
- '--exclude-dir=docs/',
- '--exclude-dir=docker/',
- '--exclude-dir=data/',
+ '--cover-erase',
+ '--cover-package=datastore,external_api,language_utilities,lib,models,ner_v1,ner_v2',
+ '--cover-inclusive',
]
# Internationalization
diff --git a/chatbot_ner/setup_sentry.py b/chatbot_ner/setup_sentry.py
index 9446464a0..4b8e53a0a 100644
--- a/chatbot_ner/setup_sentry.py
+++ b/chatbot_ner/setup_sentry.py
@@ -9,8 +9,8 @@
# Support for Sentry DSN
SENTRY_DSN = os.environ.get('SENTRY_DSN')
-SENTRY_ENABLED = os.environ.get('SENTRY_ENABLED')
-SENTRY_ENABLED = True if SENTRY_ENABLED == 'True' and 'test' not in sys.argv else False
+_sentry_enabled = (os.environ.get('SENTRY_ENABLED') or '').strip().lower()
+SENTRY_ENABLED = (_sentry_enabled == 'true' and 'test' not in sys.argv)
def setup_sentry():
diff --git a/logs/.gitkeep b/logs/.gitkeep
new file mode 100644
index 000000000..e69de29bb
diff --git a/ner_v1/detectors/textual/name/hindi_const.py b/ner_v1/detectors/textual/name/lang_constants.py
similarity index 100%
rename from ner_v1/detectors/textual/name/hindi_const.py
rename to ner_v1/detectors/textual/name/lang_constants.py
diff --git a/ner_v1/detectors/textual/name/name_detection.py b/ner_v1/detectors/textual/name/name_detection.py
index bbb0dcd64..31ef23640 100644
--- a/ner_v1/detectors/textual/name/name_detection.py
+++ b/ner_v1/detectors/textual/name/name_detection.py
@@ -9,14 +9,13 @@
from language_utilities.constant import (ENGLISH_LANG, INDIC_LANGUAGES_SET, EUROPEAN_LANGUAGES_SET)
from ner_v1.constant import DATASTORE_VERIFIED, MODEL_VERIFIED
from ner_v1.constant import EMOJI_RANGES, FIRST_NAME, MIDDLE_NAME, LAST_NAME
-from ner_v1.detectors.textual.name.hindi_const import (INDIC_BADWORDS, INDIC_QUESTIONWORDS,
- INDIC_STOPWORDS, NAME_VARIATIONS, INDIC_UNICODE_RANGE,
- COMMON_INDIC_WORDS_OCCURRING_WITH_NAME)
+from ner_v1.detectors.textual.name.lang_constants import (INDIC_BADWORDS, INDIC_QUESTIONWORDS,
+ INDIC_STOPWORDS, NAME_VARIATIONS, INDIC_UNICODE_RANGE,
+ COMMON_INDIC_WORDS_OCCURRING_WITH_NAME)
from six.moves import range
# TODO: Refactor this module for readability and useability. Remove any hacks
-# TODO: Make this module python 3 compatible
class NameDetector(object):
"""
@@ -62,21 +61,24 @@ def get_format_name(name_tokens, text):
2.The original text.
Args:
- name_tokens (list): List of tokens in the name
- Example:
- ['yash', 'doshi']
+ name_tokens (list): List of tokens in the name. e.g. ['yash', 'doshi']
Returns:
- (
- [{first_name: "yash", middle_name: None, last_name: "doshi"}],
- ["yash modi"]
- )
+ (list, list): tuple containing
+ list: list of dictionaries, one for each detected name
+ list: list of str, the original text span for each detected name
+
+ Examples:
+ >>> NameDetector.get_format_name(['yash', 'p.', 'm.', 'doshi'], 'my name is yash p. m. doshi')
+ ([{'first_name': 'yash', 'middle_name': 'p. m.', 'last_name': 'doshi'}],
+ ['yash p. m. doshi'])
"""
entity_value = []
original_text = []
+ if not name_tokens:
+ return entity_value, original_text
name_text = " ".join(name_tokens)
-
first_name = name_tokens[0]
middle_name = None
last_name = None
@@ -166,7 +168,7 @@ def detect_entity(self, text, bot_message=None, predetected_values=None, **kwarg
if self.language in EUROPEAN_LANGUAGES_SET | {ENGLISH_LANG}:
entity_value, original_text = self.detect_english_name()
elif self.language in INDIC_LANGUAGES_SET:
- entity_value, original_text = self.detect_hindi_name()
+ entity_value, original_text = self.detect_indic_name()
for entity_value_dict in entity_value:
entity_value_dict.update({DATASTORE_VERIFIED: True, MODEL_VERIFIED: False})
@@ -201,7 +203,7 @@ def detect_english_name(self, text=None):
entity_value, original_text = self.get_name_using_pos_tagger(text)
return entity_value, original_text
- def detect_hindi_name(self):
+ def detect_indic_name(self):
"""
This method is used to detect Hindi names from the provided text
@@ -216,7 +218,7 @@ def detect_hindi_name(self):
>> [{first_name: u"प्रतिक", middle_name: u"श्रीदत्त", last_name: u"जयराओ"}], [ u'प्रतिक श्रीदत्त जयराओ']
"""
- if self.detect_abusive_phrases_hindi(text=self.text) or self.detect_question_hindi(text=self.text):
+ if self.detect_abusive_phrases_indic(text=self.text) or self.detect_question_indic(text=self.text):
return [], []
text = self.remove_emojis(text=self.text)
@@ -224,7 +226,7 @@ def detect_hindi_name(self):
regex = re.compile(u'[^{unicode_range}\\s]+'.format(unicode_range=INDIC_UNICODE_RANGE[self.language]), re.U)
text = regex.sub(string=text, repl='')
- entity_value, original_text = self.get_hindi_names_without_regex(text=text)
+ entity_value, original_text = self.get_indic_names_without_regex(text=text)
# Further check for name, if it might have been written in latin script.
if not entity_value:
english_present_regex = re.compile(u'[a-zA-Z]+', re.U)
@@ -364,6 +366,7 @@ def detect_person_name_entity(self, replaced_text):
def context_check_botmessage(self, botmessage):
"""
Checks if previous botmessage conatins name as a keyword or not
+
Args:
botmessage: it consists of the previous botmessage
@@ -377,12 +380,12 @@ def context_check_botmessage(self, botmessage):
botmessage = regex_pattern.sub(r'', botmessage)
botmessage = " " + botmessage.lower().strip() + " "
- for variant in NAME_VARIATIONS[self.language]:
+ for variant in NAME_VARIATIONS.get(self.language, []):
if " " + variant + " " in botmessage:
return True
return False
- def get_hindi_names_without_regex(self, text):
+ def get_indic_names_without_regex(self, text):
"""
This method is used to get detect hindi names without any regex pattern (This method is called only if
detection from regex patterns fails)
@@ -430,7 +433,7 @@ def replace_stopwords_hindi(self, text):
return ""
- def detect_abusive_phrases_hindi(self, text):
+ def detect_abusive_phrases_indic(self, text):
"""
This method is used to check for hindi abuses in the sentence
Args:
@@ -457,7 +460,7 @@ def remove_emojis(self, text):
text = emoji_pattern.sub(repl='', string=text)
return text
- def detect_question_hindi(self, text):
+ def detect_question_indic(self, text):
"""
This method is used to detect if the given text has a hindi question present in it
Args:
diff --git a/ner_v1/static/index.html b/ner_v1/static/index.html
deleted file mode 100644
index 44f8fe0ab..000000000
--- a/ner_v1/static/index.html
+++ /dev/null
@@ -1,155 +0,0 @@
-
-
-
- Haptik NER
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
diff --git a/ner_v1/static/ner_dashboard.css b/ner_v1/static/ner_dashboard.css
deleted file mode 100644
index b2fc4a4ec..000000000
--- a/ner_v1/static/ner_dashboard.css
+++ /dev/null
@@ -1,45 +0,0 @@
-.top-header{
- text-align: center;
- background: #4db9e6;
- font-family: 'latobold';
- font-size: 44px;
- color: white;
- padding: 10px;
-}
-
-.inputlabel{
- //margin-right: 50px;
- display: inline-block;
- height: 34px;
- padding-top: 8px;
-}
-
-.dropdown{
- padding-left: 0px !important;
-}
-.tab-pane{
- padding-top: 30px;
- padding-left: 30px;
-}
-
-.input-group, #entitynamefield{
- padding-top: 10px;
-}
-
-#entitynamefield{
- padding-top: 10px;
-}
-
-#entitysubmitbtn{
- margin-top: 10px;
-}
-
-pre {outline: 1px solid #ccc; padding: 5px; margin: 5px; }
-.string { color: green; }
-.number { color: darkorange; }
-.boolean { color: blue; }
-.null { color: magenta; }
-.key { color: red; }
-
-
-
diff --git a/ner_v1/static/ner_dashboard.js b/ner_v1/static/ner_dashboard.js
deleted file mode 100644
index cd5348336..000000000
--- a/ner_v1/static/ner_dashboard.js
+++ /dev/null
@@ -1,90 +0,0 @@
-$(document).ready(function(){
-
- $("#typedropdown li a").click(function(event){
- $(this).parent().parent().siblings(".btn:first-child").html($(this).text()+' ');
- var entitytype = $(this).text();
- if (entitytype == "Text"){
- $("#entitynamefield").show();
- $("#entitynamefield").empty().html("");
- $("#entitynamefield li a").click(function(event){
- $(this).parent().parent().siblings(".btn:first-child").html($(this).text()+' ');
- });
- }
- else{
- $("#entitynamefield").show();
- $("#entitynamefield").empty().html("
")
- }
-
- });
-
-
-
- $("#entitysubmitbtn").click(function(){
- var entityType = $("#entitytypes").text();
- var entityName = $("#entitynames").text();
- if (!entityName){
- entityName = $("#entitynameinput").val();
- }
- var structuredValue = $("#structuredvalue").val();
- var botMessage = $("#botmessage").val();
- var message = $("#message").val();
- var fallbackValue = $("#fallbackvalue").val();
-
- var entityUrl = "/v1/" + entityType.trim().toLowerCase().replace(/ /g,"_") + "/";
-
- $.ajax({
- url: entityUrl,
- type: "get",
- contentType:"application/json",
- data: {
- message: message.trim(),
- entity_name: entityName.trim().toLowerCase(),
- structured_value: structuredValue.trim(),
- bot_message: botMessage.trim(),
- fallback_value: fallbackValue.trim(),
-
- },
- success: function(data, textStatus, XmlHttpRequest){
- var str = JSON.stringify(data, undefined, 4);
- output(syntaxHighlight(str));
- },
- error: function(xhr, a, b){
- var errorMessage = 'Oops! Something went wonrg, please check your input data';
- output(errorMessage);
- }
-
- });
-
- });
-
- var obj = {a:1, 'b':'foo', c:[false,'false',null, 'null', {d:{e:1.3e5,f:'1.3e5'}}]};
- var str = JSON.stringify(obj, undefined, 4);
-
-});
-
-function output(inp) {
- $("#entityoutput").html(inp);
-}
-
-/*
- Function to sytax highlight the JSON. Gotten by stack overflow answer
- https://stackoverflow.com/questions/4810841/how-can-i-pretty-print-json-using-javascript
-*/
-function syntaxHighlight(json) {
- json = json.replace(/&/g, '&').replace(//g, '>');
- return json.replace(/("(\\u[a-zA-Z0-9]{4}|\\[^u]|[^\\"])*"(\s*:)?|\b(true|false|null)\b|-?\d+(?:\.\d*)?(?:[eE][+\-]?\d+)?)/g, function (match) {
- var cls = 'number';
- if (/^"/.test(match)) {
- if (/:$/.test(match)) {
- cls = 'key';
- } else {
- cls = 'string';
- }
- } else if (/true|false/.test(match)) {
- cls = 'boolean';
- } else if (/null/.test(match)) {
- cls = 'null';
- }
- return '' + match + '';
- });
-}
diff --git a/ner_v2/api.py b/ner_v2/api.py
index c863c44ae..864f74062 100644
--- a/ner_v2/api.py
+++ b/ner_v2/api.py
@@ -1,25 +1,25 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import
+
+import json
+
+import six
+from django.http import HttpResponse, JsonResponse
+from django.views.decorators.csrf import csrf_exempt
+
from chatbot_ner.config import ner_logger
+from language_utilities.constant import ENGLISH_LANG
from ner_constants import PARAMETER_MESSAGE, PARAMETER_ENTITY_NAME, PARAMETER_STRUCTURED_VALUE, \
PARAMETER_FALLBACK_VALUE, \
PARAMETER_BOT_MESSAGE, PARAMETER_TIMEZONE, PARAMETER_LANGUAGE_SCRIPT, PARAMETER_SOURCE_LANGUAGE, \
PARAMETER_PAST_DATE_REFERENCED, PARAMETER_MIN_DIGITS, PARAMETER_MAX_DIGITS, PARAMETER_NUMBER_UNIT_TYPE, \
PARAMETER_LOCALE, PARAMETER_RANGE_ENABLED
-
-from ner_v2.detectors.temporal.date.date_detection import DateAdvancedDetector
-from ner_v2.detectors.temporal.time.time_detection import TimeDetector
from ner_v2.detectors.numeral.number.number_detection import NumberDetector
from ner_v2.detectors.numeral.number_range.number_range_detection import NumberRangeDetector
-
-from ner_v2.detectors.textual.utils import get_text_entity_detection_data, verify_text_request
-from language_utilities.constant import ENGLISH_LANG
from ner_v2.detectors.pattern.phone_number.phone_number_detection import PhoneDetector
-
-from django.views.decorators.csrf import csrf_exempt
-from django.http import HttpResponse
-import json
-import six
+from ner_v2.detectors.temporal.date.date_detection import DateAdvancedDetector
+from ner_v2.detectors.temporal.time.time_detection import TimeDetector
+from ner_v2.detectors.textual.utils import get_text_entity_detection_data, verify_text_request
def get_parameters_dictionary(request):
@@ -163,7 +163,7 @@ def date(request):
ner_logger.exception('Exception for date: %s ' % e)
return HttpResponse(status=500)
- return HttpResponse(json.dumps({'data': entity_output}), content_type='application/json')
+ return JsonResponse({'data': entity_output})
@csrf_exempt
@@ -242,7 +242,7 @@ def time(request):
ner_logger.exception('Exception for time: %s ' % e)
return HttpResponse(status=500)
- return HttpResponse(json.dumps({'data': entity_output}), content_type='application/json')
+ return JsonResponse({'data': entity_output})
@csrf_exempt
@@ -338,7 +338,7 @@ def number(request):
ner_logger.exception('Exception for numeric: %s ' % e)
return HttpResponse(status=500)
- return HttpResponse(json.dumps({'data': entity_output}), content_type='application/json')
+ return JsonResponse({'data': entity_output})
@csrf_exempt
@@ -409,7 +409,7 @@ def number_range(request):
ner_logger.exception('Exception for numeric: %s ' % e)
return HttpResponse(status=500)
- return HttpResponse(json.dumps({'data': entity_output}), content_type='application/json')
+ return JsonResponse({'data': entity_output})
@csrf_exempt
@@ -554,7 +554,7 @@ def phone_number(request):
ner_logger.exception('Exception for phone_number: %s ' % e)
return HttpResponse(status=500)
- return HttpResponse(json.dumps({'data': entity_output}), content_type='application/json')
+ return JsonResponse({'data': entity_output})
@csrf_exempt
@@ -668,7 +668,7 @@ def text(request):
if request.method == "GET":
response = {"success": False, "error": "Get method is not allowed"}
- return HttpResponse(json.dumps(response), status=501)
+ return JsonResponse(response, status=405)
elif request.method == "POST":
ner_logger.debug("Fetching result")
@@ -682,22 +682,18 @@ def text(request):
response = {"success": False, "error": str(err)}
# TODO: move to ner_logger.error
ner_logger.exception(response)
- return HttpResponse(json.dumps(response), content_type='application/json',
- status=400)
+ return JsonResponse(response, status=400)
except TypeError as err:
response = {"success": False, "error": str(err)}
ner_logger.exception(response)
- return HttpResponse(json.dumps(response), content_type='application/json',
- status=400)
+ return JsonResponse(response, status=400)
except Exception as err:
response = {"success": False, "error": str(err)}
ner_logger.exception(response)
- return HttpResponse(json.dumps(response), content_type='application/json',
- status=400)
-
+ return JsonResponse(response, status=500)
if data:
response = {"success": True, "error": None, "data": data}
- return HttpResponse(json.dumps(response), content_type='application/json', status=200)
+ return JsonResponse(response, status=200)
else:
response = {"success": False, "error": "Some error while parsing"}
- return HttpResponse(json.dumps(response), status=400)
+ return JsonResponse(response, status=500)
diff --git a/ner_v2/detectors/numeral/number/hi/data/numerals_constant.csv b/ner_v2/detectors/numeral/number/hi/data/numerals_constant.csv
index 47e0a4922..483b8de05 100644
--- a/ner_v2/detectors/numeral/number/hi/data/numerals_constant.csv
+++ b/ner_v2/detectors/numeral/number/hi/data/numerals_constant.csv
@@ -5,51 +5,51 @@ number,name_variants,number_value,number_type
२,दो|do|du,2,unit
२.५,ढाई|ढ़ाई|धाइ|धाई|dhaai|daai|dhai|dai,2.5,unit
३,तीन|teen|tin,3,unit
-४,चार |char|chaar,4,unit
+४,चार|char|chaar,4,unit
५,पाँच|पांच|panch|paanch|paach,5,unit
-६,छह |chhe|chhah|chheh,6,unit
-७,सात |saat,7,unit
-८,आठ |ath|aath,8,unit
-९,नौ |nau|nao,9,unit
-१०,दस | dus|das,10,unit
-११,ग्यारह |gyareh|gyarah,11,unit
-१२,बारह | bareh|barah,12,unit
-१३,तेरह | terah|tereh,13,unit
-१४,चौदह |chaudeh|chaudah|chauda,14,unit
-१५,पन्द्रह | pandreh | pandrah|pehdrah|pendreh,15,unit
-१६,सोलह |solah|soleh|sholah|sholeh,16,unit
-१७,सत्रह |satreh|starah,17,unit
-१८,अठारह | athrah|athreh|aththarah|aththareh,18,unit
-१९,उन्नीस | unnis| unnish,19,unit
-२०,बीस | bis|bish|bees|beesh,20,unit
-२१,इक्कीस | ikkis|ikkish|ekkis|ekkish,21,unit
-२२,बाईस | bais| baish|bayis|bayish,22,unit
-२३,तेईस | teis|teish|teyis|teyish,23,unit
-२४,चौबीस | chaubis|chaubish|chaubees|chaubeesh,24,unit
-२५,पच्चीस |pachis|pachish|pachchis|pachchish|pachees|pacheesh|pachchees|pachcheesh,25,unit
-२६,छब्बीस | chhabis|chhabish|chhabees|chhabeesh|chhabbis|chhabbish|chhabbees|chhabbeesh,26,unit
-२७,सत्ताईस | sattais|sattaish|sattaees|sattaeesh,27,unit
-२८,अट्ठाईस | athais|athaish|athaees|athaeesh|aththais|aththaish|aththaees|aththaeesh,28,unit
-२९,उनतीस | untis|untish|untees|unteesh,29,unit
-३०,तीस | tis| tish|tees|teesh,30,unit
-३१,इकतीस |ikkatis|ikkatish|ikattis|ikattish|ikkattis|ikkattish|ekkatis|ekkatish|ekattis|ekattish|ekkattis|ekkattish,31,unit
+६,छह|chhe|chhah|chheh,6,unit
+७,सात|saat,7,unit
+८,आठ|ath|aath,8,unit
+९,नौ|nau|nao,9,unit
+१०,दस| dus|das,10,unit
+११,ग्यारह|gyareh|gyarah,11,unit
+१२,बारह| bareh|barah,12,unit
+१३,तेरह| terah|tereh,13,unit
+१४,चौदह|chaudeh|chaudah|chauda,14,unit
+१५,पन्द्रह| pandreh | pandrah|pehdrah|pendreh,15,unit
+१६,सोलह|solah|soleh|sholah|sholeh,16,unit
+१७,सत्रह|satreh|starah,17,unit
+१८,अठारह| athrah|athreh|aththarah|aththareh,18,unit
+१९,उन्नीस| unnis| unnish,19,unit
+२०,बीस| bis|bish|bees|beesh,20,unit
+२१,इक्कीस| ikkis|ikkish|ekkis|ekkish,21,unit
+२२,बाईस| bais| baish|bayis|bayish,22,unit
+२३,तेईस| teis|teish|teyis|teyish,23,unit
+२४,चौबीस| chaubis|chaubish|chaubees|chaubeesh,24,unit
+२५,पच्चीस|pachis|pachish|pachchis|pachchish|pachees|pacheesh|pachchees|pachcheesh,25,unit
+२६,छब्बीस| chhabis|chhabish|chhabees|chhabeesh|chhabbis|chhabbish|chhabbees|chhabbeesh,26,unit
+२७,सत्ताईस| sattais|sattaish|sattaees|sattaeesh,27,unit
+२८,अट्ठाईस| athais|athaish|athaees|athaeesh|aththais|aththaish|aththaees|aththaeesh,28,unit
+२९,उनतीस| untis|untish|untees|unteesh,29,unit
+३०,तीस| tis| tish|tees|teesh,30,unit
+३१,इकतीस|ikkatis|ikkatish|ikattis|ikattish|ikkattis|ikkattish|ekkatis|ekkatish|ekattis|ekattish|ekkattis|ekkattish,31,unit
३२,बत्तीस| batis|batish|battis|battish|batees|bateesh|battees|batteesh,32,unit
३३,तैंतीस|taitis|taitish|taitees|taiteesh|taintis|taintish|taintees|tainteesh,33,unit
-३४,चौंतीस | chautis|chautish|chautees|chauteesh|chauntis|chauntish|chauntees|chaunteesh,34,unit
+३४,चौंतीस| chautis|chautish|chautees|chauteesh|chauntis|chauntish|chauntees|chaunteesh,34,unit
३५,पैंतीस|paitis|paitish|paitees|paiteesh|paintis|paintish|paintees|painteesh,35,unit
३६,छत्तीस|chhatis|chhatish|chhatees|chhateesh|chhattis|chhattish|chhattees|chhatteesh,36,unit
३७,सैंतीस|saitis|saitish|saitees|saiteesh|saintis|saintish|saintees|sainteesh,37,unit
३८,अड़तीस|adtis|adtish|adtees|adteesh,38,unit
-३९,उनतालीस |unchalis|unchalish|unchalees|unchaleesh,39,unit
+३९,उनतालीस|unchalis|unchalish|unchalees|unchaleesh,39,unit
४०,चालीस|chalis|chalish|chalees|chaleesh,40,unit
४१,इकतालीस|iktalis|iktalish|iktalees|iktaleesh|ektalis|ektalish|ektalees|ektaleesh,41,unit
४२,बयालीस|bayalis|bayalish|bayalees|bayaleesh,42,unit
४३,तैंतालीस|taitalis|taitalish|taitalees|taitaleesh|taintalis|taintalish|taintalees|taintaleesh,43,unit
४४,चौंतालीस|चौवालिश|चौवालिस|chautalis|chautalish|chautalees|chautaleesh|chauntalis|chauntalish|chauntalees|chauntaleesh|chauvalis|chauvalish|chauvalees|chauvaleesh,44,unit
-४५,पैंतालीस |paitalis|paitalish|paitalees|paitaleesh|paintalis|paintalish|paintalees|paintaleesh,45,unit
+४५,पैंतालीस|paitalis|paitalish|paitalees|paitaleesh|paintalis|paintalish|paintalees|paintaleesh,45,unit
४६,छियालीस|chhiyalis|chhiyalish|chhiyalees|chhiyaleesh|chhialis|chhialish|chhialees|chhialeesh,46,unit
-४७,सैंतालीस |saitalis|saitalish|saitalees|saitaleesh|saintalis|saintalish|saintalees|saintaleesh,47,unit
-४८,अड़तालीस |adtalis|adtalish|adtalees|adtaleesh,48,unit
+४७,सैंतालीस|saitalis|saitalish|saitalees|saitaleesh|saintalis|saintalish|saintalees|saintaleesh,47,unit
+४८,अड़तालीस|adtalis|adtalish|adtalees|adtaleesh,48,unit
४९,उनचास|unchaas|unchaash|unchas|unchash,49,unit
५०,पचास|pachas|pachash|pachaas|pachaash,50,unit
५१,इक्याबन|इक्याबन|ikyavan|ikyawan|ekyavan|ekyawan,51,unit
@@ -102,6 +102,6 @@ number,name_variants,number_value,number_type
९८,अट्ठानवे|aththanve|aththanwe|aththanave|aththanawe|athanve|athanwe|athanave|athanawe,98,unit
९९,निन्यानवे|निन्यानबे|ninyanbe|ninyanabe|ninyanve|ninyanave|ninyanwe|ninyanawe,99,unit
१००,सौ|sau|sao,100,scale
-१०००,हज़ार|हजार|hajar|hajaar|hazar|hazaar,1000,scale
+१०००,हज़ार|हजार|hajar|hajaar|hazar|hazaar|k,1000,scale
१०००००,लाख|lakh|laakh|lac,100000,scale
१०००००००,करोड़|crore|karor|caror,10000000,scale
diff --git a/ner_v2/detectors/numeral/number/standard_number_detector.py b/ner_v2/detectors/numeral/number/standard_number_detector.py
index b7e86b463..0c454f64e 100644
--- a/ner_v2/detectors/numeral/number/standard_number_detector.py
+++ b/ner_v2/detectors/numeral/number/standard_number_detector.py
@@ -1,8 +1,10 @@
# coding=utf-8
from __future__ import absolute_import
-import pandas as pd
+
import collections
import os
+
+import pandas as pd
from six.moves import zip
try:
@@ -28,6 +30,8 @@
class BaseNumberDetector(object):
+ _SPAN_BOUNDARY_TEMPLATE = r'(?:^|(?<=[\s\"\'\,\-\?])){}(?=[\s\!\"\%\'\,\?\.\-]|$)'
+
def __init__(self, entity_name, data_directory_path, unit_type=None):
"""
Standard Number detection class, read data from language data path and help to detect number and numbers words
@@ -62,8 +66,7 @@ def __init__(self, entity_name, data_directory_path, unit_type=None):
# Variable to define default order in which detector will work
self.detector_preferences = [self._detect_number_from_digit,
- self._detect_number_from_words
- ]
+ self._detect_number_from_words]
def detect_number(self, text):
"""
@@ -224,23 +227,19 @@ def _detect_number_from_words(self, number_list=None, original_list=None):
for numeral_text in numeral_text_list:
numbers, original_texts = get_number_from_number_word(numeral_text, self.numbers_word_map)
full_list = list(zip(numbers, original_texts))
- """
- list() is added to above zip as in python 3, zip() returns a zip object instead of zip function and
- our lint checker is matching it for python 3
- """
sorted_full_list = sorted(full_list, key=lambda kv: len(kv[1]), reverse=True)
for number, original_text in sorted_full_list:
unit = None
if self.unit_type:
unit, original_text = self._get_unit_from_text(original_text, numeral_text)
- # numeral_text = numeral_text.replace(original_text, self.tag)
- _pattern = re.compile(r'\b%s\b' % re.escape(original_text), flags=_re_flags)
- numeral_text = _pattern.sub(self.tag, numeral_text)
- number_list.append({
- NUMBER_DETECTION_RETURN_DICT_VALUE: str(number),
- NUMBER_DETECTION_RETURN_DICT_UNIT: unit
- })
- original_list.append(original_text)
+ _pattern = re.compile(self._SPAN_BOUNDARY_TEMPLATE.format(re.escape(original_text)), flags=_re_flags)
+ if _pattern.search(numeral_text):
+ numeral_text = _pattern.sub(self.tag, numeral_text, 1)
+ number_list.append({
+ NUMBER_DETECTION_RETURN_DICT_VALUE: str(number),
+ NUMBER_DETECTION_RETURN_DICT_UNIT: unit
+ })
+ original_list.append(original_text)
return number_list, original_list
def _detect_number_from_digit(self, number_list=None, original_list=None):
@@ -294,12 +293,12 @@ def _detect_number_from_digit(self, number_list=None, original_list=None):
number, scale, original_text = None, None, None
if pattern[1] and pattern[1].replace(',', '').replace('.', '').isdigit():
number = pattern[1].replace(',', '')
- original_text = pattern[0].strip()
+ original_text = pattern[0].strip().strip(',.').strip()
scale = self.scale_map[pattern[2].strip()]
elif pattern[3] and pattern[3].replace(',', '').replace('.', '').isdigit():
number = pattern[3].replace(',', '')
- original_text = pattern[3].strip()
+ original_text = pattern[3].strip().strip(',.').strip()
scale = 1
if number:
@@ -308,13 +307,14 @@ def _detect_number_from_digit(self, number_list=None, original_list=None):
unit = None
if self.unit_type:
unit, original_text = self._get_unit_from_text(original_text, processed_text)
- _pattern = re.compile(r'\b%s\b' % re.escape(original_text), flags=_re_flags)
- processed_text = _pattern.sub(self.tag, processed_text)
- number_list.append({
- NUMBER_DETECTION_RETURN_DICT_VALUE: str(number),
- NUMBER_DETECTION_RETURN_DICT_UNIT: unit
- })
- original_list.append(original_text)
+ _pattern = re.compile(self._SPAN_BOUNDARY_TEMPLATE.format(re.escape(original_text)), flags=_re_flags)
+ if _pattern.search(processed_text):
+ processed_text = _pattern.sub(self.tag, processed_text, 1)
+ number_list.append({
+ NUMBER_DETECTION_RETURN_DICT_VALUE: str(number),
+ NUMBER_DETECTION_RETURN_DICT_UNIT: unit
+ })
+ original_list.append(original_text)
return number_list, original_list
@@ -330,9 +330,9 @@ def _update_processed_text(self, original_number_list):
created from entity_name
"""
for detected_text in original_number_list:
- _pattern = re.compile(r'\b%s\b' % re.escape(detected_text), flags=_re_flags)
- self.tagged_text = _pattern.sub(self.tag, self.tagged_text)
- self.processed_text = _pattern.sub('', self.processed_text)
+ _pattern = re.compile(self._SPAN_BOUNDARY_TEMPLATE.format(re.escape(detected_text)), flags=_re_flags)
+ self.tagged_text = _pattern.sub(self.tag, self.tagged_text, 1)
+ self.processed_text = _pattern.sub('', self.processed_text, 1)
class NumberDetector(BaseNumberDetector):
diff --git a/ner_v2/detectors/numeral/number_range/README.md b/ner_v2/detectors/numeral/number_range/README.md
index 9f3e32841..fb6e72266 100644
--- a/ner_v2/detectors/numeral/number_range/README.md
+++ b/ner_v2/detectors/numeral/number_range/README.md
@@ -254,8 +254,8 @@ class NumberRangeDetector(BaseNumberRangeDetector):
"""
number_range_list = number_range_list or []
original_list = original_list or []
- between_range_pattern = re.compile(ur'(between\s+({number}\d+)(?:\s+and|-)'
- ur'\s+({number}\d+))'.format(number=NUMBER_REPLACE_TEXT), re.UNICODE)
+ between_range_pattern = re.compile(ur'(between\s+({number}\d+__)\s+(?:and|-)'
+ ur'\s+({number}\d+__))'.format(number=NUMBER_REPLACE_TEXT), re.UNICODE)
number_range_matches = between_range_pattern.findall(self.processed_text)
for match in number_range_matches:
number_range, original_text = self._get_number_range(min_part_match=match[1], max_part_match=match[2],
diff --git a/ner_v2/detectors/numeral/number_range/en/number_range_detection.py b/ner_v2/detectors/numeral/number_range/en/number_range_detection.py
index 56b31a20c..779e0339c 100644
--- a/ner_v2/detectors/numeral/number_range/en/number_range_detection.py
+++ b/ner_v2/detectors/numeral/number_range/en/number_range_detection.py
@@ -24,8 +24,7 @@ def __init__(self, entity_name, language, unit_type=None):
self._detect_min_num_range_with_suffix_variants,
self._detect_max_num_range_with_prefix_variants,
self._detect_max_num_range_with_suffix_variants,
- self._detect_absolute_number
- ]
+ self._detect_absolute_number]
def _custom_num_range_between_num_and_num(self, number_range_list=None, original_list=None):
"""Detects number range of text of pattern between number1 to number2
@@ -42,8 +41,8 @@ def _custom_num_range_between_num_and_num(self, number_range_list=None, original
"""
number_range_list = number_range_list or []
original_list = original_list or []
- between_range_pattern = re.compile(u'(between\\s+({number}\\d+)(?:\\s+and|-)'
- u'\\s+({number}\\d+))'.format(number=NUMBER_REPLACE_TEXT), re.UNICODE)
+ between_range_pattern = re.compile(r'(between\s+({number}\d+__)\s+(?:and|-)'
+ r'\s+({number}\d+__))'.format(number=NUMBER_REPLACE_TEXT), re.UNICODE)
number_range_matches = between_range_pattern.findall(self.processed_text)
for match in number_range_matches:
number_range, original_text = self._get_number_range(min_part_match=match[1], max_part_match=match[2],
diff --git a/ner_v2/detectors/numeral/number_range/hi/data/number_range_keywords.csv b/ner_v2/detectors/numeral/number_range/hi/data/number_range_keywords.csv
index ac0f2e562..6b3bd32bb 100644
--- a/ner_v2/detectors/numeral/number_range/hi/data/number_range_keywords.csv
+++ b/ner_v2/detectors/numeral/number_range/hi/data/number_range_keywords.csv
@@ -3,4 +3,4 @@ ke upar| k upar| ke uper| k uper| se upar| se uper| se jada | se jyada | se adh
kam se kam| कम से कम,-1,min
jada se jada | jyada se jyada | lagbhag | ज्यादा से ज्यादा | जादा से जादा | लगभग,-1,max
se niche | se kam | se sasta | se saste |ke aaspas| ke aspas | k aaspas| k aspas | ke aas paas| ke aas pas| k aas paas| k aas pas|ke lagbhag| k lagbhag | से नीचे | से कम | से सस्ता | से सस्ते | के आसपास | के आस पास | के लगभग,1,max
-se|-|से,0,min_max
+se|-|–|से,0,min_max
diff --git a/ner_v2/detectors/numeral/number_range/standard_number_range_detector.py b/ner_v2/detectors/numeral/number_range/standard_number_range_detector.py
index fa24c3d49..3da7677c1 100644
--- a/ner_v2/detectors/numeral/number_range/standard_number_range_detector.py
+++ b/ner_v2/detectors/numeral/number_range/standard_number_range_detector.py
@@ -1,20 +1,25 @@
# coding=utf-8
from __future__ import absolute_import
-import pandas as pd
+
import collections
import os
+
+import pandas as pd
+from six.moves import zip
+
import ner_v2.detectors.numeral.constant as numeral_constant
-from ner_v2.detectors.numeral.utils import get_list_from_pipe_sep_string
from ner_v2.detectors.numeral.number.number_detection import NumberDetector
-from six.moves import zip
+from ner_v2.detectors.numeral.utils import get_list_from_pipe_sep_string
try:
import regex as re
+
_re_flags = re.UNICODE | re.V1 | re.WORD
except ImportError:
import re
+
_re_flags = re.UNICODE
NumberRangeVariant = collections.namedtuple('NumberRangeVariant', ['position', 'range_type'])
@@ -64,8 +69,7 @@ def __init__(self, entity_name, language, data_directory_path, unit_type=None):
self._detect_min_num_range_with_suffix_variants,
self._detect_max_num_range_with_prefix_variants,
self._detect_max_num_range_with_suffix_variants,
- self._detect_absolute_number
- ]
+ self._detect_absolute_number]
def _init_regex_for_range(self, data_directory_path):
"""
@@ -98,7 +102,7 @@ def _init_regex_for_range(self, data_directory_path):
self.min_range_suffix_variants = [re.escape(variant) for variant, value in self.range_variants_map.items()
if (value.position == 1 and
- value.range_type == numeral_constant.NUMBER_RANGE_MIN_TYPE)]
+ value.range_type == numeral_constant.NUMBER_RANGE_MIN_TYPE)]
self.max_range_prefix_variants = [re.escape(variant) for variant, value in self.range_variants_map.items()
if (value.position == -1 and
@@ -128,10 +132,11 @@ def _tag_number_in_text(self, processed_text):
"""
tagged_number_text = processed_text
sorted_number_detected_map = sorted(list(self.number_detected_map.items()),
- key=lambda kv: len(kv[1].original_text),
- reverse=True)
- for number_tag in sorted_number_detected_map:
- tagged_number_text = tagged_number_text.replace(number_tag[1].original_text, number_tag[0], 1)
+ key=lambda kv: len(kv[1].original_text), reverse=True)
+ span_template = self.number_detector.language_number_detector._SPAN_BOUNDARY_TEMPLATE
+ for number_tag, value_text_pair in sorted_number_detected_map:
+ tagged_number_text = re.sub(span_template.format(re.escape(value_text_pair.original_text)), number_tag,
+ tagged_number_text, count=1, flags=_re_flags)
return tagged_number_text
def _get_number_tag_dict(self):
@@ -148,8 +153,8 @@ def _get_number_tag_dict(self):
detected_number_dict = {}
entity_value_list, original_text_list = self.number_detector.detect_entity(self.processed_text)
for index, (entity_value, original_text) in enumerate(zip(entity_value_list, original_text_list)):
- detected_number_dict[numeral_constant.NUMBER_REPLACE_TEXT + str(index)] = ValueTextPair(
- entity_value=entity_value, original_text=original_text)
+ key = '{number}{index}__'.format(number=numeral_constant.NUMBER_REPLACE_TEXT, index=index)
+ detected_number_dict[key] = ValueTextPair(entity_value=entity_value, original_text=original_text)
return detected_number_dict
def _get_original_text_from_tagged_text(self, number_tag_text):
@@ -194,7 +199,7 @@ def detect_number_range(self, text):
def _detect_absolute_number(self, number_list, original_list):
number_list = number_list or []
original_list = original_list or []
- abs_number_pattern = re.compile(u'({number}\\d+)'.format(number=numeral_constant.NUMBER_REPLACE_TEXT),
+ abs_number_pattern = re.compile(r'({number}\d+__)'.format(number=numeral_constant.NUMBER_REPLACE_TEXT),
re.UNICODE)
abs_number_matches = abs_number_pattern.findall(self.processed_text)
for match in abs_number_matches:
@@ -282,7 +287,7 @@ def _detect_min_num_range_with_prefix_variants(self, number_range_list=None, ori
if self.min_range_prefix_variants:
min_prefix_choices = '|'.join(self.min_range_prefix_variants)
- min_range_start_pattern = re.compile(u'((?:{min_prefix_choices})\\s+({number}\\d+))'.format(
+ min_range_start_pattern = re.compile(r'((?:{min_prefix_choices})\s+({number}\d+__))'.format(
number=numeral_constant.NUMBER_REPLACE_TEXT, min_prefix_choices=min_prefix_choices), re.UNICODE)
number_range_matches = min_range_start_pattern.findall(self.processed_text)
for match in number_range_matches:
@@ -310,7 +315,7 @@ def _detect_min_num_range_with_suffix_variants(self, number_range_list=None, ori
if self.min_range_suffix_variants:
min_suffix_choices = '|'.join(self.min_range_suffix_variants)
- min_range_end_pattern = re.compile(u'(({number}\\d+)\\s+(?:{min_suffix_choices}))'.format(
+ min_range_end_pattern = re.compile(r'(({number}\d+__)\s+(?:{min_suffix_choices}))'.format(
number=numeral_constant.NUMBER_REPLACE_TEXT, min_suffix_choices=min_suffix_choices), re.UNICODE)
number_range_matches = min_range_end_pattern.findall(self.processed_text)
for match in number_range_matches:
@@ -340,7 +345,7 @@ def _detect_max_num_range_with_prefix_variants(self, number_range_list=None, ori
if self.max_range_prefix_variants:
max_prefix_choices = '|'.join(self.max_range_prefix_variants)
- max_range_start_pattern = re.compile(u'((?:{max_prefix_choices})\\s+({number}\\d+))'.format(
+ max_range_start_pattern = re.compile(r'((?:{max_prefix_choices})\s+({number}\d+__))'.format(
number=numeral_constant.NUMBER_REPLACE_TEXT, max_prefix_choices=max_prefix_choices), re.UNICODE)
number_range_matches = max_range_start_pattern.findall(self.processed_text)
for match in number_range_matches:
@@ -369,7 +374,7 @@ def _detect_max_num_range_with_suffix_variants(self, number_range_list=None, ori
if self.max_range_suffix_variants:
max_suffix_choices = '|'.join(self.max_range_suffix_variants)
- max_range_end_pattern = re.compile(u'(({number}\\d+)\\s+(?:{max_suffix_choices}))'.format(
+ max_range_end_pattern = re.compile(r'(({number}\d+__)\s+(?:{max_suffix_choices}))'.format(
number=numeral_constant.NUMBER_REPLACE_TEXT, max_suffix_choices=max_suffix_choices), re.UNICODE)
number_range_matches = max_range_end_pattern.findall(self.processed_text)
for match in number_range_matches:
@@ -399,9 +404,9 @@ def _detect_min_max_num_range(self, number_range_list=None, original_list=None):
if self.min_max_range_variants:
min_max_choices = '|'.join(self.min_max_range_variants)
- min_max_range_pattern = re.compile(u'(({number}\\d+)\\s*(?:{min_max_choices})\\s*'
- u'({number}\\d+))'.format(number=numeral_constant.NUMBER_REPLACE_TEXT,
- min_max_choices=min_max_choices), re.UNICODE)
+ min_max_range_pattern = re.compile(r'(({number}\d+__)\s*(?:{min_max_choices})\s*'
+ r'({number}\d+__))'.format(number=numeral_constant.NUMBER_REPLACE_TEXT,
+ min_max_choices=min_max_choices), re.UNICODE)
number_range_matches = min_max_range_pattern.findall(self.processed_text)
for match in number_range_matches:
number_range, original_text = self._get_number_range(min_part_match=match[1], max_part_match=match[2],
@@ -423,7 +428,7 @@ def _update_tagged_text(self, original_number_list):
created from entity_name
"""
for detected_text in original_number_list:
- _pattern = re.compile(u'\\b%s\\b' % re.escape(detected_text), flags=_re_flags)
+ _pattern = re.compile(r'\b%s\b' % re.escape(detected_text), flags=_re_flags)
self.tagged_text = _pattern.sub(self.tag, self.tagged_text)
diff --git a/ner_v2/detectors/temporal/time/en/time_detection.py b/ner_v2/detectors/temporal/time/en/time_detection.py
index 684d8e254..77aadaa72 100644
--- a/ner_v2/detectors/temporal/time/en/time_detection.py
+++ b/ner_v2/detectors/temporal/time/en/time_detection.py
@@ -1246,7 +1246,7 @@ def _detect_24_hour_format(self, time_list=None, original_list=None):
time_list = []
if original_list is None:
original_list = []
- patterns = re.findall(r'\b(({timezone})?\s*(00?|0?[2-9]|0?1[0-9]?|2[0-3])[:.\s]?([0-5][0-9])\s*({timezone})?)'
+ patterns = re.findall(r'\b(({timezone})?\s*(00?|0?[2-9]|0?1[0-9]?|2[0-3])[:.\s]([0-5][0-9])\s*({timezone})?)'
r'(?!\s*(?:am|pm|a\.m\.?|p\.m\.?|(?:{timezone})|\d))'
.format(timezone=self.timezone_choices),
self.processed_text.lower())
@@ -1364,11 +1364,10 @@ def _detect_time_without_format_preceeding(self, time_list=None, original_list=N
r'({timezone})?)\b'.format(timezone=self.timezone_choices),
self.processed_text.lower())
- if not patterns and self.bot_message:
- if re.findall(r"Time|time", self.bot_message.lower()):
- patterns = re.findall(r'\b(({timezone})?\s*([0-2]?[0-9])'
- r'()\s*({timezone})?)\b'.format(timezone=self.timezone_choices),
- self.processed_text.lower())
+ if not patterns and self.bot_message and re.findall(r"Time|time", self.bot_message.lower()):
+ patterns = re.findall(r'\b(({timezone})?\s*([0-2]?[0-9])'
+ r'()\s*({timezone})?)\b'.format(timezone=self.timezone_choices),
+ self.processed_text.lower())
for pattern in patterns:
original = pattern[0].strip()
t1 = pattern[2]
@@ -1423,9 +1422,7 @@ def _get_meridiem(self, hours, mins, timezone):
return 'hrs'
if current_hour >= TWELVE_HOUR:
current_hour -= 12
- if current_hour < hours:
- return PM_MERIDIEM
- elif current_hour == hours and current_min < mins:
+ if (current_hour < hours) or (current_hour == hours and current_min < mins):
return PM_MERIDIEM
else:
if current_hour > hours:
@@ -1739,10 +1736,7 @@ def _remove_time_range_entities(self, time_list, original_list):
time_list_final = []
original_list_final = []
for i, entity in enumerate(time_list):
- if 'range' not in entity:
- time_list_final.append(entity)
- original_list_final.append(original_list[i])
- elif not entity['range']:
+ if not entity.get('range'):
time_list_final.append(entity)
original_list_final.append(original_list[i])
return time_list_final, original_list_final
diff --git a/ner_v2/tests/numeral/number/en/number_ner_tests.yaml b/ner_v2/tests/numeral/number/en/number_ner_tests.yaml
index 4e6e4bf43..24cfa727c 100644
--- a/ner_v2/tests/numeral/number/en/number_ner_tests.yaml
+++ b/ner_v2/tests/numeral/number/en/number_ner_tests.yaml
@@ -5,7 +5,7 @@ tests:
unit_type: null
outputs:
- output_id: 1
- value: 100
+ value: "100"
unit: null
original_text: "100"
- id: en_2
@@ -13,7 +13,7 @@ tests:
unit_type: currency
outputs:
- output_id: 1
- value: 100
+ value: "100"
unit: rupees
original_text: "rs.100"
- id: en_3
@@ -21,7 +21,7 @@ tests:
unit_type: null
outputs:
- output_id: 1
- value: 11.2
+ value: "11.2"
unit: null
original_text: "11.2"
- id: en_4
@@ -29,7 +29,7 @@ tests:
unit_type: currency
outputs:
- output_id: 1
- value: 10120
+ value: "10120"
unit: rupees
original_text: "10.12k rupees"
- id: en_5
@@ -37,7 +37,7 @@ tests:
unit_type: null
outputs:
- output_id: 1
- value: 1000
+ value: "1000"
unit: null
original_text: "1 thousand"
- id: en_6
@@ -45,7 +45,7 @@ tests:
unit_type: currency
outputs:
- output_id: 1
- value: 1000
+ value: "1000"
unit: rupees
original_text: "1 thousand rupees"
- id: en_7
@@ -53,7 +53,7 @@ tests:
unit_type: null
outputs:
- output_id: 1
- value: 2200
+ value: "2200"
unit: null
original_text: "2.2k"
- id: en_8
@@ -61,7 +61,7 @@ tests:
unit_type: currency
outputs:
- output_id: 1
- value: 2300
+ value: "2300"
unit: rupees
original_text: "2.3k rupees"
- id: en_9
@@ -72,3 +72,107 @@ tests:
value: null
unit: null
original_text: null
+ - id: en_10
+ message: "my alpha numeric code is 123ABC678DEF012, got it?"
+ unit_type: null
+ outputs:
+ - output_id: 1
+ value: null
+ unit: null
+ original_text: null
+ - id: en_11
+ message: "my alpha numeric code is 123 ABC 678 DEF 012, got it?"
+ unit_type: null
+ outputs:
+ - output_id: 1
+ value: "123"
+ unit: null
+ original_text: "123"
+ - output_id: 2
+ value: "678"
+ unit: null
+ original_text: "678"
+ - output_id: 3
+ value: "12"
+ unit: null
+ original_text: "012"
+ - id: en_12
+ message: "My name is Chirag Jain. The date is 28th Feb, 28/02/2021 next Monday 9:30 pm. This morning next weekday, tomorrow evening. emails are jain@abc.com chirag@example.com. yes no 1 2 3 12 123 1234 12345 123456 1234567890 918097678009 ALWPG5809L. My number is +911234567890. other number is 7123456789. The city is Mumbai, Maharashtra and Lucknow Delhi. jio phone more Rs. 10. 500 - 1000 rupees. less than 50000 rupees and 3 children. prod01 pillows https://haptik.ai a-b 1-2 a ab active"
+ unit_type: null
+ outputs:
+ - original_text: "1"
+ output_id: 1
+ unit: null
+ value: "1"
+ - original_text: "2"
+ output_id: 2
+ unit: null
+ value: "2"
+ - original_text: "3"
+ output_id: 3
+ unit: null
+ value: "3"
+ - original_text: "12"
+ output_id: 4
+ unit: null
+ value: "12"
+ - original_text: "123"
+ output_id: 5
+ unit: null
+ value: "123"
+ - original_text: "1234"
+ output_id: 6
+ unit: null
+ value: "1234"
+ - original_text: "12345"
+ output_id: 7
+ unit: null
+ value: "12345"
+ - original_text: "123456"
+ output_id: 8
+ unit: null
+ value: "123456"
+ - original_text: "10"
+ output_id: 9
+ unit: null
+ value: "10"
+ - original_text: "500"
+ output_id: 10
+ unit: null
+ value: "500"
+ - original_text: "1000"
+ output_id: 11
+ unit: null
+ value: "1000"
+ - original_text: "50000"
+ output_id: 12
+ unit: null
+ value: "50000"
+ - original_text: "3"
+ output_id: 13
+ unit: null
+ value: "3"
+ - original_text: "1"
+ output_id: 14
+ unit: null
+ value: "1"
+ - original_text: "2"
+ output_id: 15
+ unit: null
+ value: "2"
+ - id: en_13
+ message: "My name is Chirag Jain. The date is 28th Feb, 28/02/2021 next Monday 9:30 pm. This morning next weekday, tomorrow evening. emails are jain@abc.com chirag@example.com. yes no 1 2 3 12 123 1234 12345 123456 1234567890 918097678009 ALWPG5809L. My number is +911234567890. other number is 7123456789. The city is Mumbai, Maharashtra and Lucknow Delhi. jio phone more Rs. 10. 500 - 1000 rupees. less than 50000 rupees and 3 children. prod01 pillows https://haptik.ai a-b 1-2 a ab active"
+ unit_type: currency
+ outputs:
+ - original_text: "rs. 10"
+ output_id: 1
+ unit: rupees
+ value: "10"
+ - original_text: "1000 rupees"
+ output_id: 2
+ unit: rupees
+ value: "1000"
+ - original_text: "50000 rupees"
+ output_id: 3
+ unit: rupees
+ value: "50000"
\ No newline at end of file
diff --git a/ner_v2/tests/numeral/number/en/test_number_detection.py b/ner_v2/tests/numeral/number/en/test_number_detection.py
index 8511405c0..bcd834583 100644
--- a/ner_v2/tests/numeral/number/en/test_number_detection.py
+++ b/ner_v2/tests/numeral/number/en/test_number_detection.py
@@ -147,6 +147,9 @@ class NumberDetectorTestMeta(type):
def __new__(cls, name, bases, attrs):
for test_name, test_fn in cls.yaml_testsuite_generator():
+ if test_name in attrs:
+ raise ValueError('Got duplicate test name {test_name}, please make sure all tests have unique "id"'
+ .format(test_name=test_name))
attrs[test_name] = test_fn
return super(NumberDetectorTestMeta, cls).__new__(cls, name, bases, attrs)
diff --git a/ner_v2/tests/numeral/number_range/number_range_ner_tests.yaml b/ner_v2/tests/numeral/number_range/number_range_ner_tests.yaml
index 5d7058747..3a749ae45 100644
--- a/ner_v2/tests/numeral/number_range/number_range_ner_tests.yaml
+++ b/ner_v2/tests/numeral/number_range/number_range_ner_tests.yaml
@@ -4,7 +4,7 @@ tests:
message: "I want more than 200 banana"
outputs:
- max_value: null
- min_value: '200'
+ min_value: "200"
original_text: "more than 200"
output_id: 1
unit: null
@@ -14,7 +14,7 @@ tests:
message: "My monthly salary will be more than 2k per month"
outputs:
- max_value: null
- min_value: '2000'
+ min_value: "2000"
original_text: "more than 2k"
output_id: 1
unit: null
@@ -24,7 +24,7 @@ tests:
message: "more than 2.5k people in the stadium"
outputs:
- max_value: null
- min_value: '2500'
+ min_value: "2500"
original_text: "more than 2.5k"
output_id: 1
unit: null
@@ -64,7 +64,7 @@ tests:
message: "more than 200 rupees"
outputs:
- max_value: null
- min_value: '200'
+ min_value: "200"
original_text: "more than 200 rupees"
output_id: 1
unit: rupees
@@ -74,7 +74,7 @@ tests:
message: "more than 2k rupees"
outputs:
- max_value: null
- min_value: '2000'
+ min_value: "2000"
original_text: "more than 2k rupees"
output_id: 1
unit: rupees
@@ -84,7 +84,7 @@ tests:
message: "more than 2.5k rupees"
outputs:
- max_value: null
- min_value: '2500'
+ min_value: "2500"
original_text: "more than 2.5k rupees"
output_id: 1
unit: rupees
@@ -93,8 +93,8 @@ tests:
- id: en_10
message: "200 to 300"
outputs:
- - max_value: '300'
- min_value: '200'
+ - max_value: "300"
+ min_value: "200"
original_text: "200 to 300"
output_id: 1
unit: null
@@ -103,8 +103,8 @@ tests:
- id: en_11
message: "200 - 300"
outputs:
- - max_value: '300'
- min_value: '200'
+ - max_value: "300"
+ min_value: "200"
original_text: "200 - 300"
output_id: 1
unit: null
@@ -113,8 +113,8 @@ tests:
- id: en_12
message: "200-300"
outputs:
- - max_value: '300'
- min_value: '200'
+ - max_value: "300"
+ min_value: "200"
original_text: "200-300"
output_id: 1
unit: null
@@ -150,197 +150,341 @@ tests:
unit: null
abs_value: null
unit_type: currency
-# - id: en_16
-# message: "200 to 300 ruppes"
-# outputs:
-# - max_value: 300
-# min_value: 200
-# original_text: "200 to 300 ruppes"
-# output_id: 1
-# unit: rupees
-# unit_type: currency
-# - id: en_17
-# message: "200 – 300 rupees"
-# outputs:
-# - max_value: 300
-# min_value: 200
-# original_text: "200 – 300 rupees"
-# output_id: 1
-# unit: rupees
-# unit_type: currency
-# - id: en_18
-# message: "200-300 rupees"
-# outputs:
-# - max_value: 300
-# min_value: 200
-# original_text: "200-300 rupees"
-# output_id: 1
-# unit: rupees
-# unit_type: currency
-# - id: en_19
-# message: "200 rupees to 300"
-# outputs:
-# - max_value: 300
-# min_value: 200
-# original_text: "200 rupees to 300"
-# output_id: 1
-# unit: rupees
-# unit_type: currency
-# - id: en_20
-# message: "200 rupees to 300 rupees"
-# outputs:
-# - max_value: 300
-# min_value: 200
-# original_text: "200 rupees to 300 rupees"
-# output_id: 1
-# unit: rupees
-# unit_type: currency
-# - id: en_21
-# message: "200 rupees – 300"
-# outputs:
-# - max_value: 300
-# min_value: 200
-# original_text: "200 rupees – 300"
-# output_id: 1
-# unit: rupees
-# unit_type: currency
-# - id: en_22
-# message: "200 rupees – 300 rupees"
-# outputs:
-# - max_value: 300
-# min_value: 200
-# original_text: "200 rupees – 300 rupees"
-# output_id: 1
-# unit: rupees
-# unit_type: currency
-# - id: en_23
-# message: "200-300 rupees"
-# outputs:
-# - max_value: 300
-# min_value: 200
-# original_text: "200-300 rupees"
-# output_id: 1
-# unit: rupees
-# unit_type: currency
-# - id: en_24
-# message: "200k-300k men and around 400 women"
-# outputs:
-# - max_value: 300000
-# min_value: 200000
-# original_text: "200k-300k"
-# output_id: 1
-# unit: null
-# - max_value: 400
-# min_value: null
-# original_text: "around 400"
-# output_id: 2
-# unit: null
-# unit_type: null
-# - id: en_25
-# message: "200k-300k men and around 300k women"
-# outputs:
-# - max_value: 300000
-# min_value: 200000
-# original_text: "200k-300k"
-# output_id: 1
-# unit: null
-# - max_value: 300000
-# min_value: null
-# original_text: "around 300k"
-# output_id: 2
-# unit: null
-# unit_type: null
-# - id: en_26
-# message: "between 2000 and 3000"
-# outputs:
-# - max_value: 3000
-# min_value: 2000
-# original_text: "between 2000 and 3000"
-# output_id: 1
-# unit: null
-# unit_type: null
+ - id: en_16
+ message: "200 to 300 rupees"
+ outputs:
+ - max_value: "300"
+ min_value: "200"
+ original_text: "200 to 300 rupees"
+ output_id: 1
+ unit: rupees
+ abs_value: null
+ unit_type: currency
+ - id: en_17
+ message: "200 - 300 rupees"
+ outputs:
+ - max_value: "300"
+ min_value: "200"
+ original_text: "200 - 300 rupees"
+ output_id: 1
+ unit: rupees
+ abs_value: null
+ unit_type: currency
+ - id: en_18
+ message: "200-300 rupees"
+ outputs:
+ - max_value: "300"
+ min_value: "200"
+ original_text: "200-300 rupees"
+ output_id: 1
+ unit: rupees
+ abs_value: null
+ unit_type: currency
+ - id: en_19
+ message: "200 rupees to 300"
+ outputs:
+ - max_value: "300"
+ min_value: "200"
+ original_text: "200 rupees to 300"
+ output_id: 1
+ unit: rupees
+ abs_value: null
+ unit_type: currency
+ - id: en_20
+ message: "200 rupees to 300 rupees"
+ outputs:
+ - max_value: "300"
+ min_value: "200"
+ original_text: "200 rupees to 300 rupees"
+ output_id: 1
+ unit: rupees
+ abs_value: null
+ unit_type: currency
+ - id: en_21
+ message: "200 rupees - 300"
+ outputs:
+ - max_value: "300"
+ min_value: "200"
+ original_text: "200 rupees - 300"
+ output_id: 1
+ unit: rupees
+ abs_value: null
+ unit_type: currency
+ - id: en_22
+ message: "200 rupees - 300 rupees"
+ outputs:
+ - max_value: "300"
+ min_value: "200"
+ original_text: "200 rupees - 300 rupees"
+ output_id: 1
+ unit: rupees
+ abs_value: null
+ unit_type: currency
+ - id: en_23
+ message: "200-300 rupees"
+ outputs:
+ - max_value: "300"
+ min_value: "200"
+ original_text: "200-300 rupees"
+ output_id: 1
+ unit: rupees
+ abs_value: null
+ unit_type: currency
+ - id: en_24
+ message: "200k-300k men and around 400 women"
+ outputs:
+ - max_value: "300000"
+ min_value: "200000"
+ original_text: "200k-300k"
+ output_id: 1
+ unit: null
+ abs_value: null
+ - max_value: "400"
+ min_value: null
+ original_text: "around 400"
+ output_id: 2
+ unit: null
+ abs_value: null
+ unit_type: null
+ - id: en_25
+ message: "200k-300k men and around 300k women"
+ outputs:
+ - max_value: "300000"
+ min_value: "200000"
+ original_text: "200k-300k"
+ output_id: 1
+ unit: null
+ abs_value: null
+ - max_value: "300000"
+ min_value: null
+ original_text: "around 300k"
+ output_id: 2
+ unit: null
+ abs_value: null
+ unit_type: null
+ - id: en_26
+ message: "between 2000 and 3000"
+ outputs:
+ - max_value: "3000"
+ min_value: "2000"
+ original_text: "between 2000 and 3000"
+ output_id: 1
+ unit: null
+ abs_value: null
+ unit_type: null
+ - id: en_27
+ message: "My name is Chirag Jain. The date is 28th Feb, 28/02/2021 next Monday 9:30 pm. This morning next weekday, tomorrow evening. emails are jain@abc.com chirag@example.com. yes no 1 2 3 12 123 1234 12345 123456 1234567890 918097678009 ALWPG5809L. My number is +911234567890. other number is 7123456789. The city is Mumbai, Maharashtra and Lucknow Delhi. jio phone more Rs. 10. 500 - 1000 rupees. less than 50000 rupees and 3 children. prod01 pillows https://haptik.ai a-b 1-2 a ab active"
+ unit_type: null
+ outputs:
+ - abs_value: null
+ max_value: "1000"
+ min_value: "500"
+ original_text: "500 - 1000"
+ output_id: 1
+ unit: null
+ - abs_value: null
+ max_value: "2"
+ min_value: "1"
+ original_text: "1-2"
+ output_id: 2
+ unit: null
+ - abs_value: null
+ max_value: "50000"
+ min_value: null
+ original_text: "less than 50000"
+ output_id: 3
+ unit: null
+ - abs_value: "1"
+ max_value: null
+ min_value: null
+ original_text: "1"
+ output_id: 4
+ unit: null
+ - abs_value: "2"
+ max_value: null
+ min_value: null
+ original_text: "2"
+ output_id: 5
+ unit: null
+ - abs_value: "3"
+ max_value: null
+ min_value: null
+ original_text: "3"
+ output_id: 6
+ unit: null
+ - abs_value: "12"
+ max_value: null
+ min_value: null
+ original_text: "12"
+ output_id: 7
+ unit: null
+ - abs_value: "123"
+ max_value: null
+ min_value: null
+ original_text: "123"
+ output_id: 8
+ unit: null
+ - abs_value: "1234"
+ max_value: null
+ min_value: null
+ original_text: "1234"
+ output_id: 9
+ unit: null
+ - abs_value: "12345"
+ max_value: null
+ min_value: null
+ original_text: "12345"
+ output_id: 10
+ unit: null
+ - abs_value: "123456"
+ max_value: null
+ min_value: null
+ original_text: "123456"
+ output_id: 11
+ unit: null
+ - abs_value: "1234567890"
+ max_value: null
+ min_value: null
+ original_text: "1234567890"
+ output_id: 12
+ unit: null
+ - abs_value: "918097678009"
+ max_value: null
+ min_value: null
+ original_text: "918097678009"
+ output_id: 13
+ unit: null
+ - abs_value: "7123456789"
+ max_value: null
+ min_value: null
+ original_text: "7123456789"
+ output_id: 14
+ unit: null
+ - abs_value: "10"
+ max_value: null
+ min_value: null
+ original_text: "10"
+ output_id: 15
+ unit: null
+ - abs_value: "3"
+ max_value: null
+ min_value: null
+ original_text: "3"
+ output_id: 16
+ unit: null
+ - id: en_28
+ message: "My name is Chirag Jain. The date is 28th Feb, 28/02/2021 next Monday 9:30 pm. This morning next weekday, tomorrow evening. emails are jain@abc.com chirag@example.com. yes no 1 2 3 12 123 1234 12345 123456 1234567890 918097678009 ALWPG5809L. My number is +911234567890. other number is 7123456789. The city is Mumbai, Maharashtra and Lucknow Delhi. jio phone more Rs. 10. 500 - 1000 rupees. less than 50000 rupees and 3 children. prod01 pillows https://haptik.ai a-b 1-2 a ab active"
+ unit_type: currency
+ outputs:
+ - abs_value: null
+ max_value: "1000"
+ min_value: "500"
+ original_text: "500 - 1000 rupees"
+ output_id: 1
+ unit: rupees
+ - abs_value: null
+ max_value: "50000"
+ min_value: null
+ original_text: "less than 50000 rupees"
+ output_id: 2
+ unit: rupees
+ - abs_value: "10"
+ max_value: null
+ min_value: null
+ original_text: "rs. 10"
+ output_id: 3
+ unit: rupees
hi:
-# - id: hi_1
-# message: "200 se jyada"
-# outputs:
-# - max_value: null
-# min_value: 200
-# original_text: "200 se jyada"
-# output_id: 1
-# unit: null
-# unit_type: null
-# - id: hi_2
-# message: "2k se upar"
-# outputs:
-# - max_value: null
-# min_value: 2000
-# original_text: "2k se upar"
-# output_id: 1
-# unit: null
-# unit_type: null
-# - id: hi_3
-# message: "jada se jada 2500"
-# outputs:
-# - max_value: 2500
-# min_value: null
-# original_text: "jada se jada 2500"
-# output_id: 1
-# unit: null
-# unit_type: null
-# - id: hi_7
-# message: "200 rupees se jyada"
-# outputs:
-# - max_value: null
-# min_value: 200
-# original_text: "200 rupees se jyada"
-# output_id: 1
-# unit: rupees
-# unit_type: currency
-# - id: hi_8
-# message: "Rupees 2000 se upar"
-# outputs:
-# - max_value: null
-# min_value: 2000
-# original_text: "Rupees 2000 se upar"
-# output_id: 1
-# unit: rupees
-# unit_type: currency
-# - id: hi_9
-# message: "jada se jada 2500 rupees"
-# outputs:
-# - max_value: 2500
-# min_value: null
-# original_text: "jada se jada 2500 rupees"
-# output_id: 1
-# unit: rupees
-# unit_type: currency
-# - id: hi_10
-# message: "200 se 300"
-# outputs:
-# - max_value: 300
-# min_value: 200
-# original_text: "200 se 300"
-# output_id: 1
-# unit: null
-# unit_type: null
-# - id: hi_11
-# message: "200 – 300"
-# outputs:
-# - max_value: 300
-# min_value: 200
-# original_text: "200 – 300"
-# output_id: 1
-# unit: null
-# unit_type: null
-# - id: hi_12
-# message: "200-300"
-# outputs:
-# - max_value: 300
-# min_value: 200
-# original_text: "200-300"
-# output_id: 1
-# unit: null
-# unit_type: null
+ - id: hi_1
+ message: "200 se jyada"
+ outputs:
+ - max_value: null
+ min_value: "200"
+ original_text: "200 se jyada"
+ output_id: 1
+ unit: null
+ abs_value: null
+ unit_type: null
+ - id: hi_2
+ message: "2k se upar"
+ outputs:
+ - max_value: null
+ min_value: "2000"
+ original_text: "2k se upar"
+ output_id: 1
+ unit: null
+ abs_value: null
+ unit_type: null
+ - id: hi_3
+ message: "jada se jada 2500"
+ outputs:
+ - max_value: "2500"
+ min_value: null
+ original_text: "jada se jada 2500"
+ output_id: 1
+ unit: null
+ abs_value: null
+ unit_type: null
+ - id: hi_7
+ message: "200 rupees se jyada"
+ outputs:
+ - max_value: null
+ min_value: "200"
+ original_text: "200 rupees se jyada"
+ output_id: 1
+ unit: rupees
+ abs_value: null
+ unit_type: currency
+ - id: hi_8
+ message: "Rupees 2000 se upar"
+ outputs:
+ - max_value: null
+ min_value: "2000"
+ original_text: "Rupees 2000 se upar"
+ output_id: 1
+ unit: rupees
+ abs_value: null
+ unit_type: currency
+ - id: hi_9
+ message: "jada se jada 2500 rupees"
+ outputs:
+ - max_value: "2500"
+ min_value: null
+ original_text: "jada se jada 2500 rupees"
+ output_id: 1
+ unit: rupees
+ abs_value: null
+ unit_type: currency
+ - id: hi_10
+ message: "200 se 300"
+ outputs:
+ - max_value: "300"
+ min_value: "200"
+ original_text: "200 se 300"
+ output_id: 1
+ unit: null
+ abs_value: null
+ unit_type: null
+ - id: hi_11
+ message: "200 - 300"
+ outputs:
+ - max_value: "300"
+ min_value: "200"
+ original_text: "200 - 300"
+ output_id: 1
+ unit: null
+ abs_value: null
+ unit_type: null
+ - id: hi_12
+ message: "200-300"
+ outputs:
+ - max_value: "300"
+ min_value: "200"
+ original_text: "200-300"
+ output_id: 1
+ unit: null
+ abs_value: null
+ unit_type: null
- id: hi_13
message: "200 se 300"
outputs:
@@ -371,105 +515,116 @@ tests:
unit: null
abs_value: null
unit_type: currency
-# - id: hi_16
-# message: "200 se 300 rupees"
-# outputs:
-# - max_value: 300
-# min_value: 200
-# original_text: "200 se 300 rupees"
-# output_id: 1
-# unit: rupees
-# unit_type: currency
-# - id: hi_17
-# message: "200 – 300 rupees"
-# outputs:
-# - max_value: 300
-# min_value: 200
-# original_text: "200 – 300 rupees"
-# output_id: 1
-# unit: rupees
-# unit_type: currency
-# - id: hi_18
-# message: "200-300 rupees"
-# outputs:
-# - max_value: 300
-# min_value: 200
-# original_text: "200-300 rupees"
-# output_id: 1
-# unit: rupees
-# unit_type: currency
-# - id: hi_19
-# message: "200 rupees se 300"
-# outputs:
-# - max_value: 300
-# min_value: 200
-# original_text: "200 rupees se 300"
-# output_id: 1
-# unit: rupees
-# unit_type: currency
-# - id: hi_20
-# message: "200 rupees se 300 rupees"
-# outputs:
-# - max_value: 300
-# min_value: 200
-# original_text: "200 rupees se 300 rupees"
-# output_id: 1
-# unit: rupees
-# unit_type: currency
-# - id: hi_21
-# message: "200 rupees – 300"
-# outputs:
-# - max_value: 300
-# min_value: 200
-# original_text: "200 rupees – 300"
-# output_id: 1
-# unit: rupees
-# unit_type: currency
-# - id: hi_22
-# message: "200 rupees – 300 rupees"
-# outputs:
-# - max_value: 300
-# min_value: 200
-# original_text: "200 rupees – 300 rupees"
-# output_id: 1
-# unit: rupees
-# unit_type: currency
-# - id: hi_23
-# message: "200-300 rupees"
-# outputs:
-# - max_value: 300
-# min_value: 200
-# original_text: "200-300 rupees"
-# output_id: 1
-# unit: rupees
-# unit_type: currency
-# - id: hi_24
-# message: "२०० से ज्यादा"
-# outputs:
-# - max_value: null
-# min_value: 200
-# original_text: "२०० से ज्यादा"
-# output_id: 1
-# unit: null
-# unit_type: null
-# - id: hi_25
-# message: "२ हजार से ऊपर"
-# outputs:
-# - max_value: null
-# min_value: 2000
-# original_text: "२ हजार से ऊपर"
-# output_id: 1
-# unit: null
-# unit_type: null
-# - id: hi_26
-# message: "ज्यादा से ज्यादा २ हजार"
-# outputs:
-# - max_value: 2000
-# min_value: null
-# original_text: "ज्यादा से ज्यादा २ हजार"
-# output_id: 1
-# unit: null
-# unit_type: null
+ - id: hi_16
+ message: "200 se 300 rupees"
+ outputs:
+ - max_value: "300"
+ min_value: "200"
+ original_text: "200 se 300 rupees"
+ output_id: 1
+ unit: rupees
+ abs_value: null
+ unit_type: currency
+ - id: hi_17
+ message: "200 - 300 rupees"
+ outputs:
+ - max_value: "300"
+ min_value: "200"
+ original_text: "200 - 300 rupees"
+ output_id: 1
+ unit: rupees
+ abs_value: null
+ unit_type: currency
+ - id: hi_18
+ message: "200-300 rupees"
+ outputs:
+ - max_value: "300"
+ min_value: "200"
+ original_text: "200-300 rupees"
+ output_id: 1
+ unit: rupees
+ abs_value: null
+ unit_type: currency
+ - id: hi_19
+ message: "200 rupees se 300"
+ outputs:
+ - max_value: "300"
+ min_value: "200"
+ original_text: "200 rupees se 300"
+ output_id: 1
+ unit: rupees
+ abs_value: null
+ unit_type: currency
+ - id: hi_20
+ message: "200 rupees se 300 rupees"
+ outputs:
+ - max_value: "300"
+ min_value: "200"
+ original_text: "200 rupees se 300 rupees"
+ output_id: 1
+ unit: rupees
+ abs_value: null
+ unit_type: currency
+ - id: hi_21
+ message: "200 rupees - 300"
+ outputs:
+ - max_value: "300"
+ min_value: "200"
+ original_text: "200 rupees - 300"
+ output_id: 1
+ unit: rupees
+ abs_value: null
+ unit_type: currency
+ - id: hi_22
+ message: "200 rupees - 300 rupees"
+ outputs:
+ - max_value: "300"
+ min_value: "200"
+ original_text: "200 rupees - 300 rupees"
+ output_id: 1
+ unit: rupees
+ abs_value: null
+ unit_type: currency
+ - id: hi_23
+ message: "200-300 rupees"
+ outputs:
+ - max_value: "300"
+ min_value: "200"
+ original_text: "200-300 rupees"
+ output_id: 1
+ unit: rupees
+ abs_value: null
+ unit_type: currency
+ - id: hi_24
+ message: "२०० से ज्यादा"
+ outputs:
+ - max_value: null
+ min_value: "200"
+ original_text: "२०० से ज्यादा"
+ output_id: 1
+ unit: null
+ abs_value: null
+ unit_type: null
+ - id: hi_25
+ message: "२ हजार से ऊपर"
+ outputs:
+ - max_value: null
+ min_value: "2000"
+ original_text: "२ हजार से ऊपर"
+ output_id: 1
+ unit: null
+ abs_value: null
+ unit_type: null
+ - id: hi_26
+ message: "ज्यादा से ज्यादा २ हजार"
+ outputs:
+ - max_value: "2000"
+ min_value: null
+ original_text: "ज्यादा से ज्यादा २ हजार"
+ output_id: 1
+ unit: null
+ abs_value: null
+ unit_type: null
- id: hi_27
message: "२०० से ज्यादा"
outputs:
@@ -500,60 +655,66 @@ tests:
unit: null
abs_value: null
unit_type: currency
-# - id: hi_30
-# message: "२०० रूपीस से ज्यादा"
-# outputs:
-# - max_value: null
-# min_value: 200
-# original_text: "२०० रूपीस से ज्यादा"
-# output_id: 1
-# unit: rupees
-# unit_type: currency
-# - id: hi_31
-# message: "रूपीस २ हजार से ऊपर"
-# outputs:
-# - max_value: null
-# min_value: 2000
-# original_text: रूपीस २ हजार से ऊपर
-# output_id: 1
-# unit: rupees
-# unit_type: currency
-# - id: hi_32
-# message: "ज्यादा से ज्यादा ५ हजार रुपया"
-# outputs:
-# - max_value: 5000
-# min_value: null
-# original_text: ज्यादा से ज्यादा ५ हजार रुपया
-# output_id: 1
-# unit: rupees
-# unit_type: currency
-# - id: hi_33
-# message: "२०० से ३००"
-# outputs:
-# - max_value: 300
-# min_value: 200
-# original_text: २०० से ३००
-# output_id: 1
-# unit: null
-# unit_type: null
-# - id: hi_34
-# message: "२०० – ३००"
-# outputs:
-# - max_value: 300
-# min_value: 200
-# original_text: "२०० – ३००"
-# output_id: 1
-# unit: null
-# unit_type: null
-# - id: hi_35
-# message: "२००-३००"
-# outputs:
-# - max_value: 300
-# min_value: 200
-# original_text: "२००-३००"
-# output_id: 1
-# unit: null
-# unit_type: null
+ - id: hi_30
+ message: "२०० रूपीस से ज्यादा"
+ outputs:
+ - max_value: null
+ min_value: "200"
+ original_text: "२०० रूपीस से ज्यादा"
+ output_id: 1
+ unit: rupees
+ abs_value: null
+ unit_type: currency
+ - id: hi_31
+ message: "रूपीस २ हजार से ऊपर"
+ outputs:
+ - max_value: null
+ min_value: "2000"
+ original_text: रूपीस २ हजार से ऊपर
+ output_id: 1
+ unit: rupees
+ abs_value: null
+ unit_type: currency
+ - id: hi_32
+ message: "ज्यादा से ज्यादा ५ हजार रुपया"
+ outputs:
+ - max_value: "5000"
+ min_value: null
+ original_text: ज्यादा से ज्यादा ५ हजार रुपया
+ output_id: 1
+ unit: rupees
+ abs_value: null
+ unit_type: currency
+ - id: hi_33
+ message: "२०० से ३००"
+ outputs:
+ - max_value: "300"
+ min_value: "200"
+ original_text: २०० से ३००
+ output_id: 1
+ unit: null
+ abs_value: null
+ unit_type: null
+ - id: hi_34
+ message: "२०० - ३००"
+ outputs:
+ - max_value: "300"
+ min_value: "200"
+ original_text: "२०० - ३००"
+ output_id: 1
+ unit: null
+ abs_value: null
+ unit_type: null
+ - id: hi_35
+ message: "२००-३००"
+ outputs:
+ - max_value: "300"
+ min_value: "200"
+ original_text: "२००-३००"
+ output_id: 1
+ unit: null
+ abs_value: null
+ unit_type: null
- id: hi_36
message: "२०० से ३००"
outputs:
@@ -584,75 +745,83 @@ tests:
unit: null
abs_value: null
unit_type: currency
-# - id: hi_39
-# message: "२०० से ३०० रुपया"
-# outputs:
-# - max_value: 300
-# min_value: 200
-# original_text: "२०० से ३०० रुपया"
-# output_id: 1
-# unit: rupees
-# unit_type: currency
-# - id: hi_40
-# message: "२००-३०० रुपया"
-# outputs:
-# - max_value: 300
-# min_value: 200
-# original_text: "२००-३०० रुपया"
-# output_id: 1
-# unit: rupees
-# unit_type: currency
-# - id: hi_41
-# message: "२०० रुपया से ३००"
-# outputs:
-# - max_value: 300
-# min_value: 200
-# original_text: "२०० रुपया से ३००"
-# output_id: 1
-# unit: rupees
-# unit_type: currency
-# - id: hi_42
-# message: "२०० रुपया से ३०० रुपया"
-# outputs:
-# - max_value: 300
-# min_value: 200
-# original_text: "२०० रुपया से ३०० रुपया"
-# output_id: 1
-# unit: rupees
-# unit_type: currency
-# - id: hi_43
-# message: "२०० - ३०० रुपया"
-# outputs:
-# - max_value: 300
-# min_value: 200
-# original_text: "२०० - ३०० रुपया"
-# output_id: 1
-# unit: rupees
-# unit_type: currency
-# - id: hi_44
-# message: "२०० रुपया - ३००"
-# outputs:
-# - max_value: 300
-# min_value: 200
-# original_text: "२०० रुपया - ३००"
-# output_id: 1
-# unit: rupees
-# unit_type: currency
-# - id: hi_45
-# message: "२०० रुपया - ३०० रुपया "
-# outputs:
-# - max_value: 300
-# min_value: 200
-# original_text: "२०० रुपया - ३०० रुपया"
-# output_id: 1
-# unit: rupees
-# unit_type: currency
-# - id: hi_46
-# message: "२००-३०० रुपया"
-# outputs:
-# - max_value: 300
-# min_value: 200
-# original_text: "२००-३०० रुपया"
-# output_id: 1
-# unit: rupees
-# unit_type: currency
+ - id: hi_39
+ message: "२०० से ३०० रुपया"
+ outputs:
+ - max_value: "300"
+ min_value: "200"
+ original_text: "२०० से ३०० रुपया"
+ output_id: 1
+ unit: rupees
+ abs_value: null
+ unit_type: currency
+ - id: hi_40
+ message: "२००-३०० रुपया"
+ outputs:
+ - max_value: "300"
+ min_value: "200"
+ original_text: "२००-३०० रुपया"
+ output_id: 1
+ unit: rupees
+ abs_value: null
+ unit_type: currency
+ - id: hi_41
+ message: "२०० रुपया से ३००"
+ outputs:
+ - max_value: "300"
+ min_value: "200"
+ original_text: "२०० रुपया से ३००"
+ output_id: 1
+ unit: rupees
+ abs_value: null
+ unit_type: currency
+ - id: hi_42
+ message: "२०० रुपया से ३०० रुपया"
+ outputs:
+ - max_value: "300"
+ min_value: "200"
+ original_text: "२०० रुपया से ३०० रुपया"
+ output_id: 1
+ unit: rupees
+ abs_value: null
+ unit_type: currency
+ - id: hi_43
+ message: "२०० - ३०० रुपया"
+ outputs:
+ - max_value: "300"
+ min_value: "200"
+ original_text: "२०० - ३०० रुपया"
+ output_id: 1
+ unit: rupees
+ abs_value: null
+ unit_type: currency
+ - id: hi_44
+ message: "२०० रुपया - ३००"
+ outputs:
+ - max_value: "300"
+ min_value: "200"
+ original_text: "२०० रुपया - ३००"
+ output_id: 1
+ unit: rupees
+ abs_value: null
+ unit_type: currency
+ - id: hi_45
+ message: "२०० रुपया - ३०० रुपया "
+ outputs:
+ - max_value: "300"
+ min_value: "200"
+ original_text: "२०० रुपया - ३०० रुपया"
+ output_id: 1
+ unit: rupees
+ abs_value: null
+ unit_type: currency
+ - id: hi_46
+ message: "२००-३०० रुपया"
+ outputs:
+ - max_value: "300"
+ min_value: "200"
+ original_text: "२००-३०० रुपया"
+ output_id: 1
+ unit: rupees
+ abs_value: null
+ unit_type: currency
diff --git a/ner_v2/tests/numeral/number_range/test_number_range_detection.py b/ner_v2/tests/numeral/number_range/test_number_range_detection.py
index 4c25a5059..a05c4dcc1 100644
--- a/ner_v2/tests/numeral/number_range/test_number_range_detection.py
+++ b/ner_v2/tests/numeral/number_range/test_number_range_detection.py
@@ -15,6 +15,9 @@ class NumberRangeDetectorTestMeta(type):
def __new__(cls, name, bases, attrs):
for test_name, test_fn in cls.yaml_testsuite_generator():
+ if test_name in attrs:
+ raise ValueError('Got duplicate test name {test_name}, please make sure all tests have unique "id"'
+ .format(test_name=test_name))
attrs[test_name] = test_fn
return super(NumberRangeDetectorTestMeta, cls).__new__(cls, name, bases, attrs)
diff --git a/ner_v2/tests/pattern/phone_number/test_phone_number_detection.py b/ner_v2/tests/pattern/phone_number/test_phone_number_detection.py
index 863243dcf..25b9c9c84 100644
--- a/ner_v2/tests/pattern/phone_number/test_phone_number_detection.py
+++ b/ner_v2/tests/pattern/phone_number/test_phone_number_detection.py
@@ -15,6 +15,9 @@ class PhoneNumberDetectorTestMeta(type):
def __new__(cls, name, bases, attrs):
for test_name, test_fn in cls.yaml_testsuite_generator():
+ if test_name in attrs:
+ raise ValueError('Got duplicate test name {test_name}, please make sure all tests have unique "id"'
+ .format(test_name=test_name))
attrs[test_name] = test_fn
return super(PhoneNumberDetectorTestMeta, cls).__new__(cls, name, bases, attrs)
diff --git a/ner_v2/tests/temporal/time/test_time_detection.py b/ner_v2/tests/temporal/time/test_time_detection.py
index 254619141..70b6bea3b 100644
--- a/ner_v2/tests/temporal/time/test_time_detection.py
+++ b/ner_v2/tests/temporal/time/test_time_detection.py
@@ -18,6 +18,9 @@ class TimeDetectionTestMeta(type):
def __new__(cls, name, bases, attrs):
for test_name, test_fn in cls.yaml_testsuite_generator():
+ if test_name in attrs:
+ raise ValueError('Got duplicate test name {test_name}, please make sure all tests have unique "id"'
+ .format(test_name=test_name))
attrs[test_name] = test_fn
return super(TimeDetectionTestMeta, cls).__new__(cls, name, bases, attrs)
@@ -45,7 +48,7 @@ def parse_expected_outputs(expected_outputs):
"hh": expected_output["hh"],
"mm": expected_output["mm"],
"nn": expected_output["nn"],
- 'tz': expected_output["tz"],
+ "tz": expected_output["tz"],
"range": expected_output["range"],
"time_type": expected_output["time_type"]
}
diff --git a/ner_v2/tests/temporal/time/time_ner_tests.yaml b/ner_v2/tests/temporal/time/time_ner_tests.yaml
index 6c84a7282..557c5bab7 100644
--- a/ner_v2/tests/temporal/time/time_ner_tests.yaml
+++ b/ner_v2/tests/temporal/time/time_ner_tests.yaml
@@ -804,6 +804,19 @@ tests:
range: null
time_type: null
original_text: "once in 3 days"
+ - id: en_62
+ message: "your three digit code is 145 and 4 digit code is 1230. Please enter this on the app. 12 is the two digit code"
+ bot_message: null
+ range_enabled: false
+ outputs:
+ - hh: null
+ mm: null
+ nn: null
+ tz: null
+ original_text: null
+ output_id: 1
+ range: null
+ time_type: null
hi:
- id: hi_1
diff --git a/postman_tests/data/entities/numberV2.json b/postman_tests/data/entities/numberV2.json
index a65496b04..4e581df4d 100644
--- a/postman_tests/data/entities/numberV2.json
+++ b/postman_tests/data/entities/numberV2.json
@@ -429,8 +429,7 @@
},
"expected": [
{
- "original_text": "6754321",
- "value": "6754321"
+ "data": null
}
]
},
diff --git a/requirements.txt b/requirements.txt
index 13ecaa3bb..ac005bf35 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -11,24 +11,25 @@ Django==1.11.29
django-dotenv==1.4.2
weighted-levenshtein==0.1
regex==2018.7.11
-ipython
word2number==1.1
python-crfsuite==0.9.6
boto==2.49.0
boto3==1.8.4
python-dateutil==2.7.3
pandas==0.19.0
-mock==2.0.0
-django-nose==1.4.5
-typing==3.6.2
-flake8==3.4.1
pyaml==19.4.1
-coverage==4.5.3
-nose-exclude==0.5.0
spacy==2.3.2
https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.3.1/en_core_web_sm-2.3.1.tar.gz#egg=en_core_web_sm
https://github.com/explosion/spacy-models/releases/download/nl_core_news_sm-2.3.0/nl_core_news_sm-2.3.0.tar.gz#egg=nl_core_news_sm
https://github.com/explosion/spacy-models/releases/download/fr_core_news_sm-2.3.0/fr_core_news_sm-2.3.0.tar.gz#egg=fr_core_news_sm
https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-2.3.0/de_core_news_sm-2.3.0.tar.gz#egg=de_core_news_sm
https://github.com/explosion/spacy-models/releases/download/es_core_news_sm-2.3.1/es_core_news_sm-2.3.1.tar.gz#egg=es_core_news_sm
-sentry-sdk==0.14.1
+typing==3.6.2
+flake8==3.4.1
+mock==2.0.0
+coverage==5.5
+nose-exclude==0.5.0
+django-nose==1.4.7
+sentry-sdk==0.20.3
+jedi==0.17.2
+ipython==7.16.1