Skip to content

Commit 866f585

Browse files
committed
add tests for rbnf engine
1 parent 6896cf6 commit 866f585

File tree

3 files changed

+68
-43
lines changed

3 files changed

+68
-43
lines changed

babel/rbnf.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -759,3 +759,28 @@ def return_value_by_type(self, typ: int):
759759
REMAINDER_TOKEN: self.REMAINDER,
760760
SUBSTITUTION_TOKEN: self.SUBSTITUTION,
761761
}[typ]
762+
763+
764+
def parse_rbnf_rules(data, tree):
765+
"""
766+
Parse rules based on:
767+
http://www.unicode.org/reports/tr35/tr35-47/tr35-numbers.html#Rule-Based_Number_Formatting
768+
"""
769+
rbnf_rules = data.setdefault('rbnf_rules', {})
770+
771+
# ElementTree.dump(tree)
772+
773+
for ruleset_grouping in tree.findall('.//rbnf/rulesetGrouping'):
774+
group_name = ruleset_grouping.attrib['type']
775+
rbnf_rules[group_name] = [] # TODO check for overwrite
776+
for ruleset in ruleset_grouping.findall('ruleset'):
777+
ruleset_name = ruleset.attrib['type']
778+
private = ruleset.attrib.get('access') == 'private'
779+
ruleset_obj = Ruleset(ruleset_name, private)
780+
for rule in ruleset.findall('rbnfrule'):
781+
radix = rule.attrib.get('radix')
782+
if radix == "1,000": # HACK: work around misspelled radix in mt.xml
783+
radix = "1000"
784+
rule_obj = Rule(rule.attrib['value'], rule.text, radix)
785+
ruleset_obj.rules.append(rule_obj)
786+
rbnf_rules[group_name].append(ruleset_obj)

scripts/import_cldr.py

Lines changed: 4 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -472,7 +472,10 @@ def _process_local_datas(sup, srcdir, destdir, force=False, dump_json=False):
472472
rbnf_filename = os.path.join(srcdir, 'rbnf', filename)
473473
if os.path.isfile(rbnf_filename):
474474
rbnf_tree = parse(rbnf_filename)
475-
parse_rbnf_rules(data, rbnf_tree)
475+
try:
476+
rbnf.parse_rbnf_rules(data, rbnf_tree)
477+
except rbnf.RBNFError as e:
478+
log(f"{data['locale_id']}: Unable to parse rule: {e}")
476479

477480
write_datafile(data_filename, data, dump_json=dump_json)
478481

@@ -1054,38 +1057,5 @@ def parse_measurement_systems(data, tree):
10541057
_import_type_text(measurement_systems, measurement_system, type=type)
10551058

10561059

1057-
def parse_rbnf_rules(data, tree):
1058-
"""
1059-
Parse rules based on:
1060-
http://www.unicode.org/reports/tr35/tr35-47/tr35-numbers.html#Rule-Based_Number_Formatting
1061-
"""
1062-
rbnf_rules = data.setdefault('rbnf_rules', {})
1063-
1064-
# ElementTree.dump(tree)
1065-
1066-
for ruleset_grouping in tree.findall('.//rbnf/rulesetGrouping'):
1067-
group_name = ruleset_grouping.attrib['type']
1068-
rbnf_rules[group_name] = [] # TODO check for overwrite
1069-
for ruleset in ruleset_grouping.findall('ruleset'):
1070-
ruleset_name = ruleset.attrib['type']
1071-
private = ruleset.attrib.get('access') == 'private'
1072-
ruleset_obj = rbnf.Ruleset(ruleset_name, private)
1073-
for rule in ruleset.findall('rbnfrule'):
1074-
radix = rule.attrib.get('radix')
1075-
if radix == "1,000": # HACK: work around misspelled radix in mt.xml
1076-
radix = "1000"
1077-
try:
1078-
rule_obj = rbnf.Rule(rule.attrib['value'], rule.text, radix)
1079-
ruleset_obj.rules.append(rule_obj)
1080-
except rbnf.TokenizationError:
1081-
log('%s: Unable to parse rule "%s%s: %s "' % (
1082-
data['locale_id'],
1083-
rule.attrib['value'],
1084-
rule.text,
1085-
'' if radix is None else ('/%s' % radix),
1086-
))
1087-
rbnf_rules[group_name].append(ruleset_obj)
1088-
1089-
10901060
if __name__ == '__main__':
10911061
main()

tests/test_number_spelling.py

Lines changed: 39 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import os
12
import sys
23

34
import pytest
@@ -17,14 +18,20 @@ def test_basic():
1718
def test_negotiation():
1819
for lid in locale_identifiers():
1920
try:
20-
loc = rbnf.RuleBasedNumberFormat.negotiate(lid)._locale
21+
speller = rbnf.RuleBasedNumberFormat.negotiate(lid)._locale
2122
except rbnf.RulesetNotFound:
2223
# generate warning if necessary
2324
continue
2425
# test groups
25-
for k in loc._data['rbnf_rules']:
26+
for k in speller._data['rbnf_rules']:
2627
assert k in rbnf.RuleBasedNumberFormat.group_types
2728

29+
speller = rbnf.RuleBasedNumberFormat.negotiate("en")
30+
speller.match_ruleset("numbering")
31+
32+
with pytest.raises(rbnf.RulesetNotFound):
33+
speller.match_ruleset("nonexistent")
34+
2835

2936
def test_tokenization():
3037
x = list(rbnf.tokenize("text[opt];"))
@@ -34,18 +41,41 @@ def test_tokenization():
3441
]
3542
assert x == res
3643

44+
rbnf.tokenize("→→→;") # should not raise
45+
46+
with pytest.raises(ValueError, match=r"Unable to.*"):
47+
list(rbnf.tokenize("==="))
48+
49+
with pytest.warns(SyntaxWarning, match=r"Reference parsing error.*"):
50+
list(rbnf.tokenize("←bad←;"))
51+
3752

3853
def test_xml_parsing():
3954
"""
40-
all the rules should be able to go through the parser and tokenizer
41-
made up some rules and run the tokenizer on them
55+
All the rules implicitly go through the parsing during CLDR import.
4256
43-
TODO
44-
read data from all the locales that have rbnf_rules defined
45-
all the raw rules should be in a specific structure based
46-
on the XML specification
57+
This test replicates the parsing for the English locale to
58+
add coverage to the parsing parts of the code.
4759
"""
48-
assert True
60+
from xml.etree import ElementTree
61+
62+
test_locale = 'en'
63+
64+
rules = numbers.get_rbnf_rules(test_locale)
65+
66+
assert rules
67+
68+
test_file = f"cldr/cldr-common-47.0/common/rbnf/{test_locale}.xml"
69+
70+
assert os.path.isfile(test_file)
71+
72+
data = {}
73+
74+
rbnf_tree = ElementTree.parse(test_file)
75+
rbnf.parse_rbnf_rules(data, rbnf_tree)
76+
77+
assert 'rbnf_rules' in data
78+
assert 'SpelloutRules' in data['rbnf_rules']
4979

5080

5181
def test_compute_divisor():

0 commit comments

Comments
 (0)