diff --git a/src/unihan_etl/expansion.py b/src/unihan_etl/expansion.py
index aca312b2..b49caeca 100644
--- a/src/unihan_etl/expansion.py
+++ b/src/unihan_etl/expansion.py
@@ -9,6 +9,7 @@
 3. the last used compiled regexes are cached
 """
 
+import enum
 import re
 import typing as t
 
@@ -845,12 +846,61 @@ def expand_kSBGY(value: t.List[str]) -> t.List[kSBGYDict]:
     return expanded
 
 
+class kRSSimplifiedType(enum.Enum):
+    r"""Whether ideograph is a simplified form of a radical.
+
+    "The radical is indicated by a number in the range 1-214, followed by an optional
+    single apostrophe (U+0027 ' APOSTROPHE) or, double apostrophe (''), or triple
+    apostrophe (''') suffix. A single apostrophe after the radical indicates a Chinese
+    simplified version of the given radical. Two apostrophes after the radical
+    indicates a non-Chinese simplified version of the given radical. Three apostrophes
+    after the radical indicates a second non-Chinese simplified version of the given
+    radical." Source: https://www.unicode.org/reports/tr38/tr38-36.html#kRSUnicode
+    """
+
+    Chinese = "Chinese"
+    NonChinese = "NonChinese"
+    SecondNonChinese = "SecondNonChinese"
+
+
 class kRSGenericDict(t.TypedDict):
     """kRSGeneric mapping."""
 
     radical: int
     strokes: int
-    simplified: bool
+    simplified: t.Union[kRSSimplifiedType, t.Literal[False]]
+
+
+def get_krs_simplified_type(val: str) -> t.Union[kRSSimplifiedType, t.Literal[False]]:
+    """Detect type of simplified radical, if one at all.
+
+    Examples
+    --------
+    >>> get_krs_simplified_type('')
+    False
+
+    >>> get_krs_simplified_type("'")
+    <kRSSimplifiedType.Chinese: 'Chinese'>
+
+    >>> get_krs_simplified_type("''")
+    <kRSSimplifiedType.NonChinese: 'NonChinese'>
+
+    >>> get_krs_simplified_type("'''")
+    <kRSSimplifiedType.SecondNonChinese: 'SecondNonChinese'>
+    """
+    #: Chinese simplified version of the given radical.
+    if val == "'":
+        return kRSSimplifiedType.Chinese
+
+    #: Non-Chinese simplified version of the given radical.
+    if val == "''":
+        return kRSSimplifiedType.NonChinese
+
+    #: Second non-Chinese simplified version of the given radical.
+    if val == "'''":
+        return kRSSimplifiedType.SecondNonChinese
+
+    return False
 
 
 def _expand_kRSGeneric(value: t.List[str]) -> t.List[kRSGenericDict]:
@@ -860,12 +910,17 @@ def _expand_kRSGeneric(value: t.List[str]) -> t.List[kRSGenericDict]:
     --------
     >>> _expand_kRSGeneric(['5.10', "213''.0"]) # doctest: +NORMALIZE_WHITESPACE
     [{'radical': 5, 'strokes': 10, 'simplified': False},
-    {'radical': 213, 'strokes': 0, 'simplified': False}]
+    {'radical': 213, 'strokes': 0, 'simplified':
+    <kRSSimplifiedType.NonChinese: 'NonChinese'>}]
+
+    >>> _expand_kRSGeneric(["120'.3"]) # doctest: +NORMALIZE_WHITESPACE
+    [{'radical': 120, 'strokes': 3, 'simplified':
+    <kRSSimplifiedType.Chinese: 'Chinese'>}]
     """
     pattern = re.compile(
         r"""
         (?P<radical>[1-9][0-9]{0,2})
-        (?P<simplified>\'{0,2})\.
+        (?P<simplified>\'{0,3})\.
         (?P<strokes>-?[0-9]{1,2})
     """,
         re.VERBOSE,
@@ -881,7 +936,7 @@ def _expand_kRSGeneric(value: t.List[str]) -> t.List[kRSGenericDict]:
         expanded[i] = kRSGenericDict(
             radical=int(g["radical"]),
             strokes=int(g["strokes"]),
-            simplified=g["simplified"] == "'",
+            simplified=get_krs_simplified_type(g["simplified"]),
         )
     return expanded
 
diff --git a/tests/test_expansion.py b/tests/test_expansion.py
index d0d55260..24d5b3de 100644
--- a/tests/test_expansion.py
+++ b/tests/test_expansion.py
@@ -373,9 +373,59 @@ def test_expand_kRSAdobe_Japan1_6(
     ("ucn", "expected"),
     [
         # U+3491 kRSUnicode 9.13
-        ("U+3491", [{"radical": 9, "strokes": 13, "simplified": False}]),
+        (
+            "U+3491",
+            [
+                {
+                    "radical": 9,
+                    "strokes": 13,
+                    "simplified": False,
+                }
+            ],
+        ),
         # U+4336 kRSUnicode 120'.3
-        ("U+4336", [{"radical": 120, "strokes": 3, "simplified": True}]),
+        (
+            "U+4336",
+            [
+                {
+                    "radical": 120,
+                    "strokes": 3,
+                    "simplified": expansion.kRSSimplifiedType.Chinese,
+                }
+            ],
+        ),
+        # U+2CC7B kRSUnicode 182''.5 117.4
+        (
+            "U+2CC7B",
+            [
+                {
+                    "radical": 182,
+                    "strokes": 5,
+                    "simplified": expansion.kRSSimplifiedType.NonChinese,
+                },
+                {
+                    "radical": 117,
+                    "strokes": 4,
+                    "simplified": False,
+                },
+            ],
+        ),
+        # U+31E22 kRSUnicode 118.11 212'''.6
+        (
+            "U+31E22",
+            [
+                {
+                    "radical": 118,
+                    "strokes": 11,
+                    "simplified": False,
+                },
+                {
+                    "radical": 212,
+                    "strokes": 6,
+                    "simplified": expansion.kRSSimplifiedType.SecondNonChinese,
+                },
+            ],
+        ),
     ],
 )
 def test_expand_kRSUnihan(