-
Notifications
You must be signed in to change notification settings - Fork 23
/
Copy pathgen_dict_with_shape.py
156 lines (138 loc) · 5.06 KB
/
gen_dict_with_shape.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
#!/usr/local/bin/python3
# -*- coding: utf-8 -*-
r"""
Convert quanpin dictionary files to pinyin+shape for Rime input method.
"""
import csv
import re
import argparse
from typing import List, Callable, Dict
import opencc
# Shared OpenCC converter (Traditional -> Simplified Chinese, 't2s.json' config),
# created once at import time and reused for every dictionary entry.
opencc_t2s = opencc.OpenCC('t2s.json')
# Two-letter initials that Xiaohe (flypy) folds into a single key.
_FLYPY_INITIALS = {"zh": "v", "ch": "i", "sh": "u"}

# Multi-letter finals and the single key each one is typed with.
_FLYPY_FINALS = {
    "ou": "z",
    "iao": "n",
    "uang": "l",
    "iang": "l",
    "en": "f",
    "eng": "g",
    "ang": "h",
    "an": "j",
    "ao": "c",
    "ai": "d",
    "ian": "m",
    "in": "b",
    "uo": "o",
    "un": "y",
    "iu": "q",
    "uan": "r",
    "van": "r",
    "iong": "s",
    "ong": "s",
    "ue": "t",
    "ve": "t",
    "ui": "v",
    "ua": "x",
    "ia": "x",
    "ie": "p",
    "uai": "k",
    "ing": "k",
    "ei": "w",
}

# Syllables without an initial have fixed two-letter codes.
_FLYPY_ZERO_INITIAL = {
    "a": "aa",
    "an": "an",
    "ai": "ai",
    "ang": "ah",
    "o": "oo",
    "ou": "ou",
    "e": "ee",
    "n": "en",
    "en": "en",
    "eng": "eg",
}


def lunapy2flypy(pinyin: str) -> str:
    r"""Convert one full-pinyin (quanpin) syllable to its Xiaohe (flypy) code.

    To target Ziranma or another double-pinyin scheme, replace the mapping
    tables above.

    adapted from: https://github.com/boomker/rime-flypy-xhfast/blob/15664c597644bd41410ec4595cece88a6452a1bf/scripts/flypy_dict_generator_new.py

    Args:
        pinyin: A single syllable, e.g. ``"zhuang"`` or ``"an"``.

    Returns:
        The double-pinyin code, e.g. ``"vl"`` for ``"zhuang"``. An initial or
        final without a mapping is kept verbatim, so one-letter syllables
        (``"m"``) and the empty string are returned unchanged.
    """
    if pinyin in _FLYPY_ZERO_INITIAL:
        return _FLYPY_ZERO_INITIAL[pinyin]

    # Check the length before looking at the second letter so that very short
    # syllables cannot raise IndexError, and only treat a known zh/ch/sh
    # prefix as a two-letter initial so that odd input cannot raise KeyError.
    head = pinyin[:2]
    if len(pinyin) > 2 and head in _FLYPY_INITIALS:
        initial, final = _FLYPY_INITIALS[head], pinyin[2:]
    else:
        initial, final = pinyin[:1], pinyin[1:]
    return initial + _FLYPY_FINALS.get(final, final)
def get_pinyin_fn(schema: str) -> Callable[[str], str]:
    """Return the function that converts one quanpin syllable for *schema*.

    Args:
        schema: Case-insensitive schema name. ``quanpin``, ``lunapy``,
            ``luna_pinyin`` and ``none`` keep the syllable unchanged;
            ``flypy``, ``xh`` and ``xhup`` select ``lunapy2flypy``.

    Returns:
        A callable mapping a full-pinyin syllable to its code in the schema.

    Raises:
        NotImplementedError: For ``zrm`` / ``zrup``, which are recognised but
            not implemented.
        ValueError: For any other schema name. Previously this case silently
            returned ``None``, which only failed later when it was called.
    """
    schema = schema.lower()
    if schema in ("quanpin", "lunapy", "luna_pinyin", "none"):
        def do_nothing(pinyin: str) -> str:
            return pinyin
        return do_nothing
    if schema in ("flypy", "xh", "xhup"):
        return lunapy2flypy
    if schema in ("zrm", "zrup"):
        raise NotImplementedError("Pinyin schema 'zrm' not implemented.")
    raise ValueError(f"Unknown pinyin schema: {schema!r}")
def get_shape_dict(schema: str) -> Dict[str, str]:
    """Load the shape-code table stored in ``<schema>.txt``.

    The file is parsed as tab-separated text with a backtick as the quote
    character. The first column becomes the key and the second the value;
    rows with fewer than two columns are ignored, and a later row replaces
    an earlier one with the same key.
    """
    table_path = f"{schema}.txt"
    mapping: Dict[str, str] = {}
    with open(table_path, newline="", encoding='UTF-8') as table_file:
        for fields in csv.reader(table_file, delimiter="\t", quotechar="`"):
            if len(fields) < 2:
                continue
            mapping[fields[0]] = fields[1]
    return mapping
def rewrite_row(row: List[str], code_fn: Callable[[str, str], str]) -> List[str]:
    """Replace the pinyin column of one dictionary row with pinyin+shape codes.

    Args:
        row: One tab-split line of the dictionary file, e.g.
            ``['三觭龍', 'san ji long']`` or ``['三觭龍', 'san ji long', '1']``.
            The list is modified in place.
        code_fn: Called as ``code_fn(pinyin, hanzi)`` for each syllable and
            character pair; returns the code for that character.

    Returns:
        The same ``row`` object. Rows that are not convertible entries are
        returned untouched: fewer than two columns, an empty first column,
        a first column starting with ``#``, or a two-column row whose second
        column starts with a digit. If the number of characters and syllables
        differ, the row is printed and commented out by prefixing ``#``.
    """
    # Use truthiness/startswith instead of row[0][0] so that an empty first
    # column (e.g. a tab-indented header line) is passed through rather than
    # raising IndexError.
    if len(row) < 2 or not row[0] or row[0].startswith("#"):
        return row
    # Slice instead of indexing so an empty second column cannot raise.
    if len(row) == 2 and row[1][:1].isnumeric():  # ['三觭龍', '1']
        return row

    # row == ['三觭龍', 'san ji long'] or ['三觭龍', 'san ji long', '1']
    zh_chars = row[0]
    # Punctuation has no pinyin of its own, e.g. '安娜·卡列尼娜' -> '安娜卡列尼娜'
    zh_chars = re.sub("[;·,。;:“”‘’《》()!?、…—–]", "", zh_chars)
    zh_chars = opencc_t2s.convert(zh_chars)  # '三觭龍' -> '三觭龙'
    pinyin_list = row[1].split()  # ['san', 'ji', 'long']

    if len(zh_chars) != len(pinyin_list):  # failure case
        print(row)
        row[0] = "#" + row[0]
        return row

    # ['sj[hh', 'ji[[', 'ls[yp']
    code_list = [code_fn(py, zi) for (py, zi) in zip(pinyin_list, zh_chars)]
    row[1] = " ".join(code_list)  # 'sj[hh ji[[ ls[yp'
    return row
def get_cli_args() -> argparse.Namespace:
    """Parse the converter's command-line options.

    Options (all strings): ``--input_file/-i``, ``--output_file/-o``,
    ``--pinyin/-p``, ``--shape/-x`` and ``--delimiter/-d``. The module
    docstring is used as the program description.
    """
    option_specs = (
        (("--input_file", "-i"),
         {"default": "luna_pinyin.dict.yaml",
          "help": "input dictionary file"}),
        (("--output_file", "-o"),
         {"default": "",
          "help": "output dictionary file"}),
        (("--pinyin", "-p"),
         {"default": "flypy",
          "choices": ["flypy", "quanpin", "zrm"],
          "help": "pinyin schema"}),
        (("--shape", "-x"),
         {"default": "zrmfast",
          "choices": ["flypy", "zrmfast"],
          "help": "shape schema"}),
        (("--delimiter", "-d"),
         {"default": "[",
          "help": "delimiter to seperate pinyin and shape"}),
    )
    cli = argparse.ArgumentParser(description=__doc__)
    for flags, settings in option_specs:
        cli.add_argument(*flags, type=str, **settings)
    return cli.parse_args()
def main() -> None:
    """Convert the input dictionary and write the pinyin+shape dictionary.

    Reads the tab-separated input file, rewrites every entry row through
    ``rewrite_row`` and writes the result. When no output file is given, the
    name is ``<pinyin>_<shape>.<postfix>`` in the current directory, where
    ``<postfix>`` is everything after the first dot of the input file's base
    name (``luna_pinyin.dict.yaml`` -> ``flypy_zrmfast.dict.yaml``).
    """
    import os  # local import: only needed to derive the default output name

    args = get_cli_args()
    pinyin_fn = get_pinyin_fn(args.pinyin)
    shape_dict = get_shape_dict(args.shape)
    delim = args.delimiter

    with open(args.input_file, newline="", encoding='UTF-8') as f:
        rows = list(csv.reader(f, delimiter="\t", quotechar="`"))

    def code_fn(pinyin: str, hanzi: str) -> str:
        # Look up the shape of '乾' when '干' is read as 'qian'
        # (presumably because the t2s conversion maps '乾' to '干' - confirm).
        if hanzi == "干" and pinyin == "qian":
            hanzi = "乾"
        # Characters missing from the shape table get the delimiter itself as
        # their shape code, e.g. 'ji[['.
        return pinyin_fn(pinyin) + delim + shape_dict.get(hanzi, delim)

    out_rows = [rewrite_row(row, code_fn) for row in rows]

    output_file = args.output_file
    if not output_file:
        # Split only the base name: splitting the whole path breaks on
        # './dir/x.dict.yaml' or dotted directory names, and unpacking a
        # plain split() fails when the name has no dot at all.
        base_name = os.path.basename(args.input_file)
        _, _, input_postfix = base_name.partition(".")
        output_file = f"{args.pinyin}_{args.shape}"
        if input_postfix:
            output_file += f".{input_postfix}"

    with open(output_file, "w", newline="", encoding='UTF-8') as f:
        my_tsv = csv.writer(f, delimiter="\t",
                            quotechar="`", lineterminator="\n")
        my_tsv.writerows(out_rows)
# Run the conversion only when executed as a script, not when imported.
if __name__ == "__main__":
    main()