add tests for rbnf engine

blagasz · blagasz · commit 866f58533365 · 2025-09-28T23:16:31.000+02:00
diff --git a/babel/rbnf.py b/babel/rbnf.py
@@ -759,3 +759,28 @@ def return_value_by_type(self, typ: int):
             REMAINDER_TOKEN: self.REMAINDER,
             SUBSTITUTION_TOKEN: self.SUBSTITUTION,
         }[typ]
+
+
+def parse_rbnf_rules(data, tree):
+    """
+    Parse rules based on:
+    http://www.unicode.org/reports/tr35/tr35-47/tr35-numbers.html#Rule-Based_Number_Formatting
+    """
+    rbnf_rules = data.setdefault('rbnf_rules', {})
+
+    # ElementTree.dump(tree)
+
+    for ruleset_grouping in tree.findall('.//rbnf/rulesetGrouping'):
+        group_name = ruleset_grouping.attrib['type']
+        rbnf_rules[group_name] = []  # TODO check for overwrite
+        for ruleset in ruleset_grouping.findall('ruleset'):
+            ruleset_name = ruleset.attrib['type']
+            private = ruleset.attrib.get('access') == 'private'
+            ruleset_obj = Ruleset(ruleset_name, private)
+            for rule in ruleset.findall('rbnfrule'):
+                radix = rule.attrib.get('radix')
+                if radix == "1,000":  # HACK: work around misspelled radix in mt.xml
+                    radix = "1000"
+                rule_obj = Rule(rule.attrib['value'], rule.text, radix)
+                ruleset_obj.rules.append(rule_obj)
+            rbnf_rules[group_name].append(ruleset_obj)
diff --git a/scripts/import_cldr.py b/scripts/import_cldr.py
@@ -472,7 +472,10 @@ def _process_local_datas(sup, srcdir, destdir, force=False, dump_json=False):
         rbnf_filename = os.path.join(srcdir, 'rbnf', filename)
         if os.path.isfile(rbnf_filename):
             rbnf_tree = parse(rbnf_filename)
-            parse_rbnf_rules(data, rbnf_tree)
+            try:
+                rbnf.parse_rbnf_rules(data, rbnf_tree)
+            except rbnf.RBNFError as e:
+                log(f"{data['locale_id']}: Unable to parse rule: {e}")
 
         write_datafile(data_filename, data, dump_json=dump_json)
 
@@ -1054,38 +1057,5 @@ def parse_measurement_systems(data, tree):
             _import_type_text(measurement_systems, measurement_system, type=type)
 
 
-def parse_rbnf_rules(data, tree):
-    """
-    Parse rules based on:
-    http://www.unicode.org/reports/tr35/tr35-47/tr35-numbers.html#Rule-Based_Number_Formatting
-    """
-    rbnf_rules = data.setdefault('rbnf_rules', {})
-
-    # ElementTree.dump(tree)
-
-    for ruleset_grouping in tree.findall('.//rbnf/rulesetGrouping'):
-        group_name = ruleset_grouping.attrib['type']
-        rbnf_rules[group_name] = []  # TODO check for overwrite
-        for ruleset in ruleset_grouping.findall('ruleset'):
-            ruleset_name = ruleset.attrib['type']
-            private = ruleset.attrib.get('access') == 'private'
-            ruleset_obj = rbnf.Ruleset(ruleset_name, private)
-            for rule in ruleset.findall('rbnfrule'):
-                radix = rule.attrib.get('radix')
-                if radix == "1,000":  # HACK: work around misspelled radix in mt.xml
-                    radix = "1000"
-                try:
-                    rule_obj = rbnf.Rule(rule.attrib['value'], rule.text, radix)
-                    ruleset_obj.rules.append(rule_obj)
-                except rbnf.TokenizationError:
-                    log('%s: Unable to parse rule "%s%s: %s "' % (
-                        data['locale_id'],
-                        rule.attrib['value'],
-                        rule.text,
-                        '' if radix is None else ('/%s' % radix),
-                    ))
-            rbnf_rules[group_name].append(ruleset_obj)
-
-
 if __name__ == '__main__':
     main()
diff --git a/tests/test_number_spelling.py b/tests/test_number_spelling.py
@@ -1,3 +1,4 @@
+import os
 import sys
 
 import pytest
@@ -17,14 +18,20 @@ def test_basic():
 def test_negotiation():
     for lid in locale_identifiers():
         try:
-            loc = rbnf.RuleBasedNumberFormat.negotiate(lid)._locale
+            speller = rbnf.RuleBasedNumberFormat.negotiate(lid)._locale
         except rbnf.RulesetNotFound:
             # generate warning if necessary
             continue
         # test groups
-        for k in loc._data['rbnf_rules']:
+        for k in speller._data['rbnf_rules']:
             assert k in rbnf.RuleBasedNumberFormat.group_types
 
+    speller = rbnf.RuleBasedNumberFormat.negotiate("en")
+    speller.match_ruleset("numbering")
+
+    with pytest.raises(rbnf.RulesetNotFound):
+        speller.match_ruleset("nonexistent")
+
 
 def test_tokenization():
     x = list(rbnf.tokenize("text[opt];"))
@@ -34,18 +41,41 @@ def test_tokenization():
     ]
     assert x == res
 
+    rbnf.tokenize("→→→;")  # should not raise
+
+    with pytest.raises(ValueError, match=r"Unable to.*"):
+        list(rbnf.tokenize("==="))
+
+    with pytest.warns(SyntaxWarning, match=r"Reference parsing error.*"):
+        list(rbnf.tokenize("←bad←;"))
+
 
 def test_xml_parsing():
     """
-    all the rules should be able to go through the parser and tokenizer
-    made up some rules and run the tokenizer on them
+    All the rules implicitly go through the parsing during CLDR import.
 
-    TODO
-    read data from all the locales that have rbnf_rules defined
-    all the raw rules should be in a specific structure based
-    on the XML specification
+    This test replicates the parsing for the English locale to
+    add coverage to the parsing parts of the code.
     """
-    assert True
+    from xml.etree import ElementTree
+
+    test_locale = 'en'
+
+    rules = numbers.get_rbnf_rules(test_locale)
+
+    assert rules
+
+    test_file = f"cldr/cldr-common-47.0/common/rbnf/{test_locale}.xml"
+
+    assert os.path.isfile(test_file)
+
+    data = {}
+
+    rbnf_tree = ElementTree.parse(test_file)
+    rbnf.parse_rbnf_rules(data, rbnf_tree)
+
+    assert 'rbnf_rules' in data
+    assert 'SpelloutRules' in data['rbnf_rules']
 
 
 def test_compute_divisor():