Skip to content

Commit

Permalink
expansion: Update kRSUnicode for apostrophes
Browse files Browse the repository at this point in the history
  • Loading branch information
tony committed Nov 25, 2024
1 parent 7c01d41 commit 71ec136
Show file tree
Hide file tree
Showing 2 changed files with 111 additions and 6 deletions.
63 changes: 59 additions & 4 deletions src/unihan_etl/expansion.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
3. the last used compiled regexes are cached
"""

import enum
import re
import typing as t

Expand Down Expand Up @@ -845,12 +846,61 @@ def expand_kSBGY(value: t.List[str]) -> t.List[kSBGYDict]:
return expanded


class kRSSimplifiedType(enum.Enum):
r"""Whether ideograph is a simplified form of a radical.
"The radical is indicated by a number in the range 1-214, followed by an optional
single apostrophe (U+0027 ' APOSTROPHE) or, double apostrophe (''), or triple
apostrophe (''') suffix. A single apostrophe after the radical indicates a Chinese
simplified version of the given radical. Two apostrophes after the radical
indicates a non-Chinese simplified version of the given radical. Three apostrophes
after the radical indicates a second non-Chinese simplified version of the given
radical." Source: https://www.unicode.org/reports/tr38/tr38-36.html#kRSUnicode
"""

Chinese = "Chinese"
NonChinese = "NonChinese"
SecondNonChinese = "SecondNonChinese"


class kRSGenericDict(t.TypedDict):
"""kRSGeneric mapping."""

radical: int
strokes: int
simplified: bool
simplified: t.Union[kRSSimplifiedType, t.Literal[False]]


def get_krs_simplified_type(val: str) -> t.Union[kRSSimplifiedType, t.Literal[False]]:
"""Detect type of simplified radical, if one at all.
Examples
--------
>>> get_krs_simplified_type('')
False
>>> get_krs_simplified_type("'")
<kRSSimplifiedType.Chinese: 'Chinese'>
>>> get_krs_simplified_type("''")
<kRSSimplifiedType.NonChinese: 'NonChinese'>
>>> get_krs_simplified_type("'''")
<kRSSimplifiedType.SecondNonChinese: 'SecondNonChinese'>
"""
#: Chinese simplified version of the given radical.
if val == "'":
return kRSSimplifiedType.Chinese

#: Non-Chinese simplified version of the given radical.
if val == "''":
return kRSSimplifiedType.NonChinese

#: Second non-Chinese simplified version of the given radical.
if val == "'''":
return kRSSimplifiedType.SecondNonChinese

return False


def _expand_kRSGeneric(value: t.List[str]) -> t.List[kRSGenericDict]:
Expand All @@ -860,12 +910,17 @@ def _expand_kRSGeneric(value: t.List[str]) -> t.List[kRSGenericDict]:
--------
>>> _expand_kRSGeneric(['5.10', "213''.0"]) # doctest: +NORMALIZE_WHITESPACE
[{'radical': 5, 'strokes': 10, 'simplified': False},
{'radical': 213, 'strokes': 0, 'simplified': False}]
{'radical': 213, 'strokes': 0, 'simplified':
<kRSSimplifiedType.NonChinese: 'NonChinese'>}]
>>> _expand_kRSGeneric(["120'.3"]) # doctest: +NORMALIZE_WHITESPACE
[{'radical': 120, 'strokes': 3, 'simplified':
<kRSSimplifiedType.Chinese: 'Chinese'>}]
"""
pattern = re.compile(
r"""
(?P<radical>[1-9][0-9]{0,2})
(?P<simplified>\'{0,2})\.
(?P<simplified>\'{0,3})\.
(?P<strokes>-?[0-9]{1,2})
""",
re.VERBOSE,
Expand All @@ -881,7 +936,7 @@ def _expand_kRSGeneric(value: t.List[str]) -> t.List[kRSGenericDict]:
expanded[i] = kRSGenericDict(
radical=int(g["radical"]),
strokes=int(g["strokes"]),
simplified=g["simplified"] == "'",
simplified=get_krs_simplified_type(g["simplified"]),
)
return expanded

Expand Down
54 changes: 52 additions & 2 deletions tests/test_expansion.py
Original file line number Diff line number Diff line change
Expand Up @@ -373,9 +373,59 @@ def test_expand_kRSAdobe_Japan1_6(
("ucn", "expected"),
[
# U+3491 kRSUnicode 9.13
("U+3491", [{"radical": 9, "strokes": 13, "simplified": False}]),
(
"U+3491",
[
{
"radical": 9,
"strokes": 13,
"simplified": False,
}
],
),
# U+4336 kRSUnicode 120'.3
("U+4336", [{"radical": 120, "strokes": 3, "simplified": True}]),
(
"U+4336",
[
{
"radical": 120,
"strokes": 3,
"simplified": expansion.kRSSimplifiedType.Chinese,
}
],
),
# U+2CC7B kRSUnicode 182''.5 117.4
(
"U+2CC7B",
[
{
"radical": 182,
"strokes": 5,
"simplified": expansion.kRSSimplifiedType.NonChinese,
},
{
"radical": 117,
"strokes": 4,
"simplified": False,
},
],
),
# U+31E22 kRSUnicode 118.11 212'''.6
(
"U+31E22",
[
{
"radical": 118,
"strokes": 11,
"simplified": False,
},
{
"radical": 212,
"strokes": 6,
"simplified": expansion.kRSSimplifiedType.SecondNonChinese,
},
],
),
],
)
def test_expand_kRSUnihan(
Expand Down

0 comments on commit 71ec136

Please sign in to comment.