Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added Support for numeral systems #1

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion number_parser/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
from number_parser.parser import parse, parse_number, parse_ordinal, parse_fraction, parse_roman
from number_parser.parser import parse, parse_number, parse_ordinal, parse_fraction, NUMERAL_SYSTEMS
78 changes: 50 additions & 28 deletions number_parser/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
# Characters treated as sentence boundaries — NOTE(review): inferred from the name; confirm against usage.
SENTENCE_SEPARATORS = [".", ","]
# Language codes the parser accepts — NOTE(review): inferred from the name; confirm against usage.
SUPPORTED_LANGUAGES = ['en', 'es', 'hi', 'ru']
# NOTE(review): purpose is not visible here; presumably languages needing a regex workaround — confirm.
RE_BUG_LANGUAGES = ['hi']
# Numeral system names; these are the two values parse() recognises in its `numeral_systems` argument.
NUMERAL_SYSTEMS = ['decimal', 'roman']


class LanguageData:
Expand Down Expand Up @@ -250,6 +251,9 @@ def parse_number(input_string, language=None):
if input_string.strip().isnumeric():
return int(input_string)

if re.search("^(m{0,3})(cm|cd|d?c{0,3})(xc|xl|l?x{0,3})(ix|iv|v?i{0,3})$", input_string.lower()):
return int(_parse_roman(input_string))

if language is None:
language = _valid_tokens_by_language(input_string)

Expand Down Expand Up @@ -299,40 +303,33 @@ def parse_fraction(input_string, language=None):
return None


def parse_roman(input_string):
    """
    Replaces every token of `input_string` that is a well-formed Roman numeral (case-insensitive,
    1 to 3999) with its decimal value. All other tokens are kept unchanged. Returns the rebuilt string.

    NOTE: any word spelled like a Roman numeral (e.g. "I" or "mix") is converted as well.
    """
    roman = {'i': 1, 'v': 5, 'x': 10, 'l': 50, 'c': 100, 'd': 500, 'm': 1000}
    roman_pattern = "^(m{0,3})(cm|cd|d?c{0,3})(xc|xl|l?x{0,3})(ix|iv|v?i{0,3})$"

    def build_roman(roman_number):
        # Only called on tokens already validated against roman_pattern.
        numeral = roman_number.lower()
        built_num = 0
        for position, symbol in enumerate(numeral):
            value = roman[symbol]
            following = numeral[position + 1:position + 2]
            # A symbol written before a larger one is subtracted (iv = 4, cm = 900).
            if following and roman[following] > value:
                built_num -= value
            else:
                built_num += value
        return built_num

    tokens = _tokenize(input_string, None)
    for index, token in enumerate(tokens):
        # The pattern is all-optional, so it also matches '' — skip empty tokens rather than emit "0".
        # fullmatch (unlike search with `$`) also rejects a token that ends in a newline.
        if token and re.fullmatch(roman_pattern, token.lower()):
            tokens[index] = str(build_roman(token))

    return ''.join(tokens)


def parse(input_string, language=None, numeral_systems=None):
    """
    Converts all the numbers in a sentence written in natural language to their numeric type while keeping
    the other words unchanged. Returns the transformed string.

    `numeral_systems` is a list of numeral system names applied in the given order, each pass working on
    the output of the previous one. 'decimal' and 'roman' are recognised; any other name is skipped.
    When it is None both systems are applied; when it is empty the input is returned unchanged.
    """
    if numeral_systems is None:
        numeral_systems = ['decimal', 'roman']

    if language is None:
        language = _valid_tokens_by_language(input_string)

    # Start from the input in a local variable, so an empty or unrecognised list of systems
    # returns the sentence untouched and no state is shared between calls.
    complete_sentence = input_string

    for numeral_system in numeral_systems:
        if numeral_system == 'decimal':
            complete_sentence = _parse_decimal(complete_sentence, language)
        elif numeral_system == 'roman':
            complete_sentence = _parse_roman(complete_sentence)

    return complete_sentence


def _parse_decimal(input_string, language):
lang_data = LanguageData(language)

tokens = _tokenize(input_string, language)
Expand Down Expand Up @@ -386,8 +383,33 @@ def _build_and_add_number(pop_last_space=False):

_build_and_add_number()
current_sentence.append(token)

_build_and_add_number()

final_sentence.extend(current_sentence)
return ''.join(final_sentence).strip()


def _parse_roman(input_string):
    """
    Replaces every token of `input_string` that is a well-formed Roman numeral (case-insensitive)
    with its decimal value. All other tokens are kept unchanged. Returns the rebuilt string.

    NOTE: any word spelled like a Roman numeral (e.g. "I" or "mix") is converted as well.
    """
    tokens = _tokenize(input_string, None)
    for index, token in enumerate(tokens):
        # fullmatch (unlike search with `$`) also rejects a token that ends in a newline,
        # which the builder cannot handle.
        if not re.fullmatch("^(m{0,3})(cm|cd|d?c{0,3})(xc|xl|l?x{0,3})(ix|iv|v?i{0,3})$", token.lower()):
            continue
        # Build once and write back by position instead of re-searching the list for the token.
        value = _build_roman(token)
        # 0 means the token held no numeral (the pattern also matches the empty string).
        if value != 0:
            tokens[index] = str(value)

    return ''.join(tokens)


def _build_roman(roman_number):
roman = {'i': 1, 'v': 5, 'x': 10, 'l': 50, 'c': 100, 'd': 500, 'm': 1000}
num_tokens = re.split("^(m{0,3})(cm|cd|d?c{0,3})(xc|xl|l?x{0,3})(ix|iv|v?i{0,3})$", roman_number.lower())
num_tokens = [item for item in num_tokens if item != '']
built_num = 0
for num_token in num_tokens:
if re.search('iv|ix|xl|xc|cd|cm', num_token):
built_num += roman[num_token[1]] - roman[num_token[0]]
elif re.search('[vld][ixc]{1,3}', num_token):
built_num += roman[num_token[0]] + (roman[num_token[1]] * (len(num_token) - 1))
else:
built_num += roman[num_token[0]] * len(num_token)
return built_num
3 changes: 3 additions & 0 deletions test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
from number_parser import parse, parse_number

# parse_number() accepts only (input_string, language); the numeral-system selection lives on
# parse(), whose keyword is `numeral_systems` and takes a list of system names.
# Expected to print the sentence with CDXX replaced by 420.
print(parse('built in CDXX', numeral_systems=['roman']))
2 changes: 1 addition & 1 deletion tests/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ def get_test_files(path, prefix):
def _test_files(path, language, is_ordinal=True):
fnx = parse_ordinal if is_ordinal else parse_number
for filename in get_test_files(path, f'{language}_'):
with open(filename, "r") as csv_file:
with open(filename, "r", encoding="utf8") as csv_file:
csv_reader = csv.DictReader(csv_file)
for row in csv_reader:
try:
Expand Down
Loading