From 8c6bd8f56723963f4ea833d54ba90037df1e997b Mon Sep 17 00:00:00 2001 From: Dhananjay Date: Fri, 7 Jan 2022 03:25:40 +0530 Subject: [PATCH 1/5] Added ignore feature to parse function --- number_parser/parser.py | 99 ++++++++++++++++++++++++++++------------- 1 file changed, 69 insertions(+), 30 deletions(-) diff --git a/number_parser/parser.py b/number_parser/parser.py index e0d67c2..13739d7 100644 --- a/number_parser/parser.py +++ b/number_parser/parser.py @@ -298,7 +298,7 @@ def parse_fraction(input_string, language=None): return None -def parse(input_string, language=None): +def parse(input_string, language=None, ignore=None): """ Converts all the numbers in a sentence written in natural language to their numeric type while keeping the other words unchanged. Returns the transformed string. @@ -326,39 +326,78 @@ def _build_and_add_number(pop_last_space=False): current_sentence.pop() for token in tokens: - compare_token = _strip_accents(token.lower()) - ordinal_number = _is_ordinal_token(compare_token, lang_data) - - if not compare_token.strip(): - if not tokens_taken: + if ignore: + if token in ignore: + _build_and_add_number() current_sentence.append(token) - continue + else: + compare_token = _strip_accents(token.lower()) + ordinal_number = _is_ordinal_token(compare_token, lang_data) + + if not compare_token.strip(): + if not tokens_taken: + current_sentence.append(token) + continue + + if compare_token in SENTENCE_SEPARATORS: + _build_and_add_number(pop_last_space=True) + current_sentence.append(token) + final_sentence.extend(current_sentence) + current_sentence = [] + continue + + if ordinal_number: + tokens_taken.append(ordinal_number) + _build_and_add_number(pop_last_space=True) + elif ( + _is_cardinal_token(compare_token, lang_data) + or (_is_skip_token(compare_token, lang_data) and len(tokens_taken) != 0) + ): + tokens_taken.append(compare_token) + else: + if tokens_taken and _is_skip_token(tokens_taken[-1], lang_data): + # when finishing with a skip_token --> keep it + skip_token = tokens_taken[-1] + tokens_taken.pop() + _build_and_add_number() + current_sentence.extend([skip_token, " "]) + + _build_and_add_number() + current_sentence.append(token) + else: + compare_token = _strip_accents(token.lower()) + ordinal_number = _is_ordinal_token(compare_token, lang_data) - if compare_token in SENTENCE_SEPARATORS: - _build_and_add_number(pop_last_space=True) - current_sentence.append(token) - final_sentence.extend(current_sentence) - current_sentence = [] - continue + if not compare_token.strip(): + if not tokens_taken: + current_sentence.append(token) + continue - if ordinal_number: - tokens_taken.append(ordinal_number) - _build_and_add_number(pop_last_space=True) - elif ( - _is_cardinal_token(compare_token, lang_data) - or (_is_skip_token(compare_token, lang_data) and len(tokens_taken) != 0) - ): - tokens_taken.append(compare_token) - else: - if tokens_taken and _is_skip_token(tokens_taken[-1], lang_data): - # when finishing with a skip_token --> keep it - skip_token = tokens_taken[-1] - tokens_taken.pop() - _build_and_add_number() - current_sentence.extend([skip_token, " "]) + if compare_token in SENTENCE_SEPARATORS: + _build_and_add_number(pop_last_space=True) + current_sentence.append(token) + final_sentence.extend(current_sentence) + current_sentence = [] + continue + + if ordinal_number: + tokens_taken.append(ordinal_number) + _build_and_add_number(pop_last_space=True) + elif ( + _is_cardinal_token(compare_token, lang_data) + or (_is_skip_token(compare_token, lang_data) and len(tokens_taken) != 0) + ): + tokens_taken.append(compare_token) + else: + if tokens_taken and _is_skip_token(tokens_taken[-1], lang_data): + # when finishing with a skip_token --> keep it + skip_token = tokens_taken[-1] + tokens_taken.pop() + _build_and_add_number() + current_sentence.extend([skip_token, " "]) - _build_and_add_number() - current_sentence.append(token) + _build_and_add_number() + current_sentence.append(token) _build_and_add_number() From f79ff0622b4cfb39fcec8b7838be8d360902f920 Mon Sep 17 00:00:00 2001 From: Dhananjay Date: Fri, 7 Jan 2022 03:28:10 +0530 Subject: [PATCH 2/5] Added few tests for number parsing including ignore case --- tests/test_number_parsing.py | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/tests/test_number_parsing.py b/tests/test_number_parsing.py index 2de6d09..e824c86 100644 --- a/tests/test_number_parsing.py +++ b/tests/test_number_parsing.py @@ -121,6 +121,31 @@ def test_parse_sentences_ordinal(expected, test_input, lang): assert parse(test_input, lang) == expected +@pytest.mark.parametrize( + "test_input,expected,lang,ignore", + [ + ('fifty fifth sixty seventh', 'fifty 5 67', 'en', ['fifty','seven']), + # en + ('Two thousand sentences', '2 thousand sentences', 'en', ['thousand']), + ('twenty one', '20 one', 'en', ['one']), + ('I have three apples and one pear.', 'I have three apples and 1 pear.', 'en', ['three']), + # numeric + ('eleven', 'eleven', 'en', ['eleven']), + ('one hundred and forty two', 'one 140 two', 'en', ['one','two']), + ('one hundred and one', 'one 100 one', 'en', ['one']), + ('ignore this sentence', 'ignore this sentence', 'en', ['ignore']), + ('five hundred sixty seven thousand twenty four', 'five 167020 four', 'en', ['fifty','five','four']), + ('one million four hundred twenty-three thousand nine hundred twenty-two', '1000400 twenty-3900 twenty-two', 'en', ['two','twenty']), + ('nine hundred ninety-nine thousand nine hundred ninety-nine', 'nine 190 nine 1000 nine 190 nine', 'en', ['nine']), + ('one million fifty thousand', '1000000 fifty 1000', 'en', ['fifty']), + ('two billion one hundred forty seven million four hundred eighty three thousand six hundred forty seven', + 'two 1000000000 one 140 seven 1000483 thousand 640 seven', 'en', ['two','thousand','seven','one']), + + ] +) +def test_parse_including_ignore(expected, test_input, lang, ignore): + assert parse(test_input, lang, ignore) == expected + @pytest.mark.parametrize( "test_input,expected,lang", From 6e5b7aed46076333f85be2d45ddefdb1f42896f9 Mon Sep 17 00:00:00 2001 From: Dhananjay Date: Fri, 7 Jan 2022 03:59:41 +0530 Subject: [PATCH 3/5] few more test cases for checks --- tests/test_number_parsing.py | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/tests/test_number_parsing.py b/tests/test_number_parsing.py index e824c86..c2ea7fa 100644 --- a/tests/test_number_parsing.py +++ b/tests/test_number_parsing.py @@ -124,22 +124,25 @@ def test_parse_sentences_ordinal(expected, test_input, lang): @pytest.mark.parametrize( "test_input,expected,lang,ignore", [ - ('fifty fifth sixty seventh', 'fifty 5 67', 'en', ['fifty','seven']), + ('fifty fifth sixty seventh', "fifty 5 67", 'en', ['fifty','seven']), + ('hundredth and one', "100 and 1", 'en',[]), + ('one hundred and forty second', "140 second", 'en', ['second']), + ('five thousandth and one', "5000 and one", 'en', ['one']), # en - ('Two thousand sentences', '2 thousand sentences', 'en', ['thousand']), - ('twenty one', '20 one', 'en', ['one']), - ('I have three apples and one pear.', 'I have three apples and 1 pear.', 'en', ['three']), + ('Two thousand sentences', "2 thousand sentences", 'en', ['thousand']), + ('twenty one', "20 one", 'en', ['one']), + ('I have three apples and one pear.', "I have three apples and 1 pear.", 'en', ['three']), # numeric - ('eleven', 'eleven', 'en', ['eleven']), - ('one hundred and forty two', 'one 140 two', 'en', ['one','two']), + ('eleven', "eleven", 'en', ['eleven']), + ('one hundred and forty two', "one 140 two", 'en', ['one','two']), ('one hundred and one', 'one 100 one', 'en', ['one']), - ('ignore this sentence', 'ignore this sentence', 'en', ['ignore']), - ('five hundred sixty seven thousand twenty four', 'five 167020 four', 'en', ['fifty','five','four']), - ('one million four hundred twenty-three thousand nine hundred twenty-two', '1000400 twenty-3900 twenty-two', 'en', ['two','twenty']), - ('nine hundred ninety-nine thousand nine hundred ninety-nine', 'nine 190 nine 1000 nine 190 nine', 'en', ['nine']), - ('one million fifty thousand', '1000000 fifty 1000', 'en', ['fifty']), + ('ignore this sentence', "ignore this sentence", 'en', ['ignore']), + ('five hundred sixty seven thousand twenty four', "five 167020 four", 'en', ['fifty','five','four']), + ('one million four hundred twenty-three thousand nine hundred twenty-two', "1000400 twenty-3900 twenty-two", 'en', ['two','twenty']), + ('nine hundred ninety-nine thousand nine hundred ninety-nine', "nine 190 nine 1000 nine 190 nine", 'en', ['nine']), + ('one million fifty thousand', "1000000 fifty 1000", 'en', ['fifty']), ('two billion one hundred forty seven million four hundred eighty three thousand six hundred forty seven', - 'two 1000000000 one 140 seven 1000483 thousand 640 seven', 'en', ['two','thousand','seven','one']), + "two 1000000000 one 140 seven 1000483 thousand 640 seven", 'en', ['two','thousand','seven','one']), ] ) From 237d9df448af462059faed64fa8ba8184756ef84 Mon Sep 17 00:00:00 2001 From: Dhananjay Date: Sun, 9 Jan 2022 02:27:18 +0530 Subject: [PATCH 4/5] skip token test case coverage --- tests/test_number_parsing.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_number_parsing.py b/tests/test_number_parsing.py index c2ea7fa..8013888 100644 --- a/tests/test_number_parsing.py +++ b/tests/test_number_parsing.py @@ -135,8 +135,8 @@ def test_parse_sentences_ordinal(expected, test_input, lang): # numeric ('eleven', "eleven", 'en', ['eleven']), ('one hundred and forty two', "one 140 two", 'en', ['one','two']), - ('one hundred and one', 'one 100 one', 'en', ['one']), - ('ignore this sentence', "ignore this sentence", 'en', ['ignore']), + ('one hundred and one', "one 100 one", 'en', ['one']), + ('seven thousand and nothing else',"seven 1000 and nothing else", 'en', ['seven']), ('five hundred sixty seven thousand twenty four', "five 167020 four", 'en', ['fifty','five','four']), ('one million four hundred twenty-three thousand nine hundred twenty-two', "1000400 twenty-3900 twenty-two", 'en', ['two','twenty']), ('nine hundred ninety-nine thousand nine hundred ninety-nine', "nine 190 nine 1000 nine 190 nine", 'en', ['nine']), From 4fd5e95dced7d4bcbf9418c4712a5c5eac2f82fd Mon Sep 17 00:00:00 2001 From: Dhananjay Date: Sun, 9 Jan 2022 02:55:33 +0530 Subject: [PATCH 5/5] Direct numbers and tens test coverage --- tests/test_number_parsing.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test_number_parsing.py b/tests/test_number_parsing.py index 8013888..965a3e7 100644 --- a/tests/test_number_parsing.py +++ b/tests/test_number_parsing.py @@ -134,6 +134,7 @@ def test_parse_sentences_ordinal(expected, test_input, lang): ('I have three apples and one pear.', "I have three apples and 1 pear.", 'en', ['three']), # numeric ('eleven', "eleven", 'en', ['eleven']), + ('ninety thirteen forty', "90 13 forty", 'en', ['forty']), ('one hundred and forty two', "one 140 two", 'en', ['one','two']), ('one hundred and one', "one 100 one", 'en', ['one']), ('seven thousand and nothing else',"seven 1000 and nothing else", 'en', ['seven']),