-
Notifications
You must be signed in to change notification settings - Fork 23
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
I have implemented parse_roman() function #64
base: master
Are you sure you want to change the base?
Changes from 17 commits
61f57d7
6664b6f
376db40
ce4e5c6
6d420a7
bf7d6e3
5104b76
94d9441
475ad04
f847cf6
45bfdc5
c13102f
973664b
0e89680
9704eff
554d553
77ae66f
fb9bff7
480cf7c
faecf42
cbc4661
086f5ec
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1 +1 @@ | ||
from number_parser.parser import parse, parse_number, parse_ordinal, parse_fraction | ||
from number_parser.parser import parse, parse_number, parse_ordinal, parse_fraction, NUMERAL_SYSTEMS |
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
@@ -1,9 +1,12 @@ | ||||||
import re | ||||||
from importlib import import_module | ||||||
import unicodedata | ||||||
|
||||||
SENTENCE_SEPARATORS = [".", ","] | ||||||
SUPPORTED_LANGUAGES = ['en', 'es', 'hi', 'ru'] | ||||||
RE_BUG_LANGUAGES = ['hi'] | ||||||
NUMERAL_SYSTEMS = ('decimal', 'roman') | ||||||
ROMAN_REGEX_EXPRESSION = "(?i)^(m{0,3})(cm|cd|d?c{0,4})(xc|xl|l?x{0,4})(ix|iv|v?i{0,4})$" | ||||||
|
||||||
|
||||||
class LanguageData: | ||||||
|
@@ -241,7 +244,11 @@ def parse_ordinal(input_string, language=None): | |||||
return parse_number(output_string, language) | ||||||
|
||||||
|
||||||
def parse_number(input_string, language=None): | ||||||
def _search_roman(search_string):
    """Match ``search_string`` against the Roman numeral pattern.

    The pattern comes from ``ROMAN_REGEX_EXPRESSION`` (anchored with ``^`` and
    ``$``) and is applied case-insensitively.

    Returns the ``re`` match object on success, or ``None`` when the text is
    not shaped like a Roman numeral.
    """
    roman_pattern = re.compile(ROMAN_REGEX_EXPRESSION, re.IGNORECASE)
    return roman_pattern.search(search_string)
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||
|
||||||
|
||||||
def parse_number(input_string, language=None, numeral_systems=None): | ||||||
"""Converts a single number written in natural language to a numeric type""" | ||||||
if not input_string.strip(): | ||||||
return None | ||||||
|
@@ -252,20 +259,37 @@ def parse_number(input_string, language=None): | |||||
if language is None: | ||||||
language = _valid_tokens_by_language(input_string) | ||||||
|
||||||
lang_data = LanguageData(language) | ||||||
if numeral_systems is None: | ||||||
numeral_systems = NUMERAL_SYSTEMS | ||||||
|
||||||
if _search_roman(input_string): | ||||||
numeral_systems = ['roman'] | ||||||
Gallaecio marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||
|
||||||
else: | ||||||
numeral_systems = ['decimal'] | ||||||
|
||||||
for numeral_system in numeral_systems: | ||||||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This will fail if |
||||||
if numeral_system == 'decimal': | ||||||
lang_data = LanguageData(language) | ||||||
|
||||||
tokens = _tokenize(input_string, language) | ||||||
normalized_tokens = _normalize_tokens(tokens) | ||||||
for index, token in enumerate(normalized_tokens): | ||||||
if _is_cardinal_token(token, lang_data) or not token.strip(): | ||||||
continue | ||||||
if _is_skip_token(token, lang_data) and index != 0: | ||||||
continue | ||||||
return None | ||||||
number_built = _build_number(normalized_tokens, lang_data) | ||||||
if len(number_built) == 1: | ||||||
return int(number_built[0]) | ||||||
return None | ||||||
Comment on lines
+280
to
+284
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Those Maybe you could move this code into a |
||||||
|
||||||
tokens = _tokenize(input_string, language) | ||||||
normalized_tokens = _normalize_tokens(tokens) | ||||||
for index, token in enumerate(normalized_tokens): | ||||||
if _is_cardinal_token(token, lang_data) or not token.strip(): | ||||||
continue | ||||||
if _is_skip_token(token, lang_data) and index != 0: | ||||||
continue | ||||||
return None | ||||||
number_built = _build_number(normalized_tokens, lang_data) | ||||||
if len(number_built) == 1: | ||||||
return int(number_built[0]) | ||||||
return None | ||||||
elif numeral_system == 'roman': | ||||||
return int(_parse_roman(input_string)) | ||||||
|
||||||
else: | ||||||
raise ValueError(f'"{numeral_system}" is not a supported numeral system') | ||||||
AmPhIbIaN26 marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||
|
||||||
|
||||||
def parse_fraction(input_string, language=None): | ||||||
|
@@ -298,14 +322,37 @@ def parse_fraction(input_string, language=None): | |||||
return None | ||||||
|
||||||
|
||||||
def parse(input_string, language=None): | ||||||
def parse(input_string, language=None, numeral_systems=None):
    """
    Converts all the numbers in a sentence written in natural language to their numeric type while keeping
    the other words unchanged. Returns the transformed string.

    ``language`` is detected with ``_valid_tokens_by_language`` when it is None.
    ``numeral_systems`` is an iterable of numeral system names ('decimal',
    'roman'); it defaults to ``NUMERAL_SYSTEMS`` when it is None. The systems
    are applied in the order given, each one working on the output of the
    previous one. An empty iterable leaves the sentence unchanged.

    Raises ValueError when a requested numeral system is not supported.
    """
    if numeral_systems is None:
        numeral_systems = NUMERAL_SYSTEMS

    if language is None:
        language = _valid_tokens_by_language(input_string)

    # Start from the input itself so that zero passes return the original
    # sentence instead of None.
    sentence = input_string
    for numeral_system in numeral_systems:
        if numeral_system == 'decimal':
            sentence = _parse_decimal(sentence, language)
        elif numeral_system == 'roman':
            sentence = _parse_roman(sentence)
        else:
            raise ValueError(f'"{numeral_system}" is not a supported numeral system')

    return sentence
AmPhIbIaN26 marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||
|
||||||
|
||||||
def _parse_decimal(input_string, language): | ||||||
lang_data = LanguageData(language) | ||||||
|
||||||
tokens = _tokenize(input_string, language) | ||||||
|
@@ -359,8 +406,40 @@ def _build_and_add_number(pop_last_space=False): | |||||
|
||||||
_build_and_add_number() | ||||||
current_sentence.append(token) | ||||||
|
||||||
_build_and_add_number() | ||||||
|
||||||
final_sentence.extend(current_sentence) | ||||||
return ''.join(final_sentence).strip() | ||||||
|
||||||
|
||||||
def _parse_roman(input_string):
    """Replace every Roman numeral token in ``input_string`` with its value.

    The string is split with ``_tokenize``, empty tokens are dropped, and each
    token accepted by ``_search_roman`` is swapped for the decimal text of
    ``_build_roman(token)``. The tokens are then joined back together without
    any separator.

    Returns the rebuilt string.
    """
    # Build the result in a single pass instead of calling ``list.index`` on
    # the list while it is being iterated and mutated.
    converted_tokens = [
        str(_build_roman(token)) if _search_roman(token) else token
        for token in _tokenize(input_string, None)
        if token != ''
    ]
    return ''.join(converted_tokens)
|
||||||
|
||||||
def _build_roman(roman_number): | ||||||
roman = {'i': 1, 'v': 5, 'x': 10, 'l': 50, 'c': 100, 'd': 500, 'm': 1000} | ||||||
|
||||||
num_tokens = re.split(ROMAN_REGEX_EXPRESSION, roman_number, re.IGNORECASE) | ||||||
num_tokens = [item for item in num_tokens if item != ''] | ||||||
|
||||||
built_num = 0 | ||||||
|
||||||
for num_token in num_tokens: | ||||||
|
||||||
if re.search('iv|ix|xl|xc|cd|cm', num_token, re.IGNORECASE): | ||||||
built_num += roman[num_token[1].lower()] - roman[num_token[0].lower()] | ||||||
|
||||||
elif re.search('[XLVD][IXC]{1,4}', num_token, re.IGNORECASE): | ||||||
built_num += roman[num_token[0].lower()] + (roman[num_token[1].lower()] * (len(num_token) - 1)) | ||||||
|
||||||
else: | ||||||
built_num += roman[num_token[0].lower()] * len(num_token) | ||||||
|
||||||
return built_num |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Let’s make this a private constant, so that we can freely rename it or move it in the future if we wish without breaking the API: