-
Notifications
You must be signed in to change notification settings - Fork 8
/
Copy pathwip2dict.ls
251 lines (231 loc) · 6.46 KB
/
wip2dict.ls
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
# Bind the hard-coded sample record returned by `meng` to `it`.
# NOTE(review): nothing visible at top level reads `it` afterwards — this
# looks like a debugging leftover; confirm before removing.
it = meng!
require! fs
# One-character tag for each pronunciation field of a source record.
# `py` prefixes every reading it emits with the matching tag.
const ABBREV = {
  '南四縣': '南',
  '四縣音': '四',
  '海陸音': '海',
  '大埔音': '大',
  '饒平音': '平',
  '詔安音': '安'
}
# Table returned by `pua` (four-character code -> replacement text), built once.
# It is consulted when `{[XXXX]}` placeholders are expanded while loading WIP.
const PUA = pua!
# Rewrite the tone digits 1-5 as superscript digits, swallowing any whitespace
# that directly follows a digit. Other characters (including 0 and 6-9) are
# left untouched.
sounds = (text) ->
  superscript = { '1': '¹', '2': '²', '3': '³', '4': '⁴', '5': '⁵' }
  text.replace(/([1-5])\s*/g, (_, digit) -> superscript[digit])
# Normalise pronunciation annotations embedded in free text. Two passes:
#
# 1. Bracketed readings: an opener (`〔`/`﹝` followed by 又讀, an opening
#    parenthesis, or 這粒), then text ending in two tone digits, a dialect
#    letter (四海大平安) or the literal `lin53go11`, then a closer. Inside the
#    brackets tone digits become superscripts (via `sounds`) and every
#    "<reading><dialect letter>" pair is rewritten as
#    "<dialect letter>U+20E3<reading>、"; the trailing `、` is dropped.
# 2. Bare readings outside brackets: runs of `\w+` ending in two digits,
#    optionally followed by a dialect letter and whitespace. Digits become
#    superscripts, the dialect letter (if any) is moved in front and followed
#    by U+20E3, and trailing whitespace is replaced by `、`.
#
# Fix: the opener/closer parentheses were bare `(` / `)` in the pattern, which
# left a group unclosed and made the regex invalid. They are now matched as
# literals inside character classes, in both ASCII and full-width form
# (U+FF08 / U+FF09, written as escapes so they survive width folding).
#
# Takes the text as its single implicit argument and returns the rewritten
# string.
bracketed = ->
  x = it.replace(
    /([〔﹝]\s*又讀\s*|[(\uFF08]|這粒)(.*?(?:[12345][12345]|[四海大平安]|lin53go11)\s*)\s*([〕)\uFF09﹞﹝])/g
    (_, pre, inner, post) ->
      # Tag each reading with its dialect letter, then drop the final separator.
      tagged = sounds(inner).replace(
        /([^南四海大平安]+)([南四海大平安])\s*/g
        (_, snd, variant) -> "#variant\u20E3#{snd}、"
      ) - /\s*、\s*$/
      "#pre#tagged#post"
  ).replace(
    # Readings handled by the first pass no longer contain ASCII digits, so
    # this pass only sees readings outside brackets.
    /((?:\w+\d\d)+)([南四海大平安]?)(\s+)?/g
    (_, snd, variant='', spc='') ->
      spc = \、 if spc
      snd = sounds snd
      variant += "\u20E3" if variant
      "#variant#snd#spc"
  )
  return x
# Clean up one definition record in place and return it.
#
# `it.example` (when present) is a list of string lists. Each inner list is
# joined, passed through `bracketed`, has its first parenthesised span of two
# or more characters unwrapped, and is trimmed. If exactly two examples remain
# and the second is only U+FFF9 + one character + U+FFFB, that character is
# treated as stray punctuation: it is appended to the first example and also
# inserted before the first example's U+FFFB, and the second example is
# dropped. An empty or missing example list is removed from the record.
#
# `it.def` gets a closing `。` unless it already ends in terminal punctuation,
# loses a leading "N." counter, and is passed through `bracketed`.
#
# Fixes:
# - The unwrapping regex had bare parentheses, so it captured and re-inserted
#   the same text (a no-op). It now matches literal ASCII or full-width
#   parentheses (U+FF08 / U+FF09).
# - In the terminal-punctuation class, `·-」` formed an accidental range from
#   U+00B7 to U+300D. The hyphen is now escaped, and the full-width forms of
#   , ; : ? ! are accepted next to the ASCII ones.
# - The stray-punctuation match uses an explicit `exec` result instead of the
#   legacy static `RegExp.$1`.
def = ->
  if it.example
    it.example = [ bracketed(e.join '').replace(/[(\uFF08]([^)\uFF09][^)\uFF09]+)[)\uFF09]/, '$1') - /^\s*/ - /\s*$/ for e in it.example ]
    lone = if it.example.length is 2 then /^\uFFF9(.)\uFFFB$/.exec it.example.1 else null
    if lone
      punct = lone.1
      it.example.0 += punct
      it.example.0 .=replace(/\uFFFB/ "#punct\uFFFB")
      it.example.pop!
  delete it.example unless it.example?length
  it.def += '。' unless /[。,、;:?!─….·\-」』》〉\uFF0C\uFF1B\uFF1A\uFF1F\uFF01]$/.test it.def
  it.def -= /^\d+\.\s*/
  it.def = bracketed it.def
  it
# Build the pronunciation string for one source record. For each dialect
# field that is present and non-empty (in the fixed order below) it emits
# "<ABBREV tag>U+20DE<reading>", where the reading has its parenthesised
# remark and all whitespace removed and its tone digits superscripted.
# Entries are joined with a single space.
#
# Fix: the remark-stripping pattern was `/(.*)/`, i.e. a capture group around
# `.*`, which deletes everything up to the first newline rather than a
# parenthesised span. It now matches literal ASCII or full-width parentheses
# (U+FF08 / U+FF09).
py = ->
  readings = []
  for x in ["四縣音" "海陸音" "大埔音" "饒平音" "詔安音" "南四縣"] when it[x]
    readings.push(ABBREV[x] + '\u20DE' + sounds(it[x] - /[(\uFF08].*[)\uFF09]/ - /\s/g))
  readings * ' '
# Strip the first `【` and the first `】` from a word.
# Any falsy input (undefined, null, '') yields the empty string.
norm = (word) ->
  return '' unless word
  word - /【/ - /】/
# Convert one source record into `{ title, heteronyms: [ ... ] }`.
#
# The single heteronym carries, in this key order: `audio_id`, `pinyin`,
# `synonyms`, `antonyms`, `definitions`. Each definition object from the
# record's 釋義 list is extended in place with a `type` field and then
# cleaned by `def`.
m2t = (entry) ->
  # Split a `、`-separated word list, drop "N." counters and 【】 marks,
  # normalise readings, and re-join with commas.
  related = (field) -> [ bracketed norm(x - /^\d+\.\s*/) for x in field / \、 ] * \,

  title = norm entry['詞目']
  # Digits of 檔名, rendered as at least five characters: add 100000, then
  # strip one leading "1" from the result.
  audio_id = (100000 + Number(entry['檔名'] - /\D/g)) - /^1/
  pinyin = py entry
  synonyms = related entry['近義詞']
  antonyms = related entry['反義詞']
  # Part-of-speech string with spaces turned into commas.
  type = "#{ entry['詞性'] || '' }".replace(/ /g \,)
  # NOTE(review): the guard matches against the element itself, not `d.def`.
  definitions = [ def d <<< { type } for d in entry['釋義'] when d is /\S\S/ ]

  { title, heteronyms: [ { audio_id, pinyin, synonyms, antonyms, definitions } ] }
# Recursively flatten nested arrays into a single flat array.
#
# Fix: elements were previously recursed into whenever they had a `.length`
# property. A string element therefore recursed forever (each character is
# itself a string with a length), and a `null`/`undefined` element threw.
# Only true arrays are descended into now; everything else is kept as-is.
flatten = (xs) ->
  [].concat.apply [], [ (if Array.isArray x then flatten x else x) for x in xs ]
# Return a sorted copy of `xs` (default: empty list), ordered ascending by the
# key that `f` computes for each element. The input list is not modified.
# Keys that compare neither greater nor smaller are treated as equal.
sort-by = (f, xs=[]) ->
  compare = (a, b) ->
    | (f a) > (f b) => 1
    | (f a) < (f b) => -1
    | otherwise     => 0
  xs.concat!.sort compare
# Load the work-in-progress records. Before parsing, every `{[XXXX]}`
# placeholder (four arbitrary characters) in the raw JSON text is replaced by
# the matching entry of `PUA`.
#
# Fix: an unknown code used to be printed to stdout and then passed to
# `process.exit` as the exit status, although it is a non-numeric string such
# as "F305". The failure is now reported on stderr and the process exits with
# status 1.
WIP = JSON.parse fs.read-file-sync(\work-in-progress.json \utf8).replace(
  /\{\[(....)\]\}/g
  (_, code) -> PUA[code] || do ->
    console.error "Unmapped placeholder code: #{code}"
    process.exit 1
)
# Group converted records by title. Each title maps to a list with one entry
# per source record; every entry is that record's heteronym list, with empty
# `synonyms` / `antonyms` fields removed.
HETERONYMS = {}
for w in WIP when w['詞目']
  converted = m2t w
  cleaned = converted.heteronyms.map (h) ->
    for key in <[ synonyms antonyms ]> when not h[key]
      delete h[key]
    h
  (HETERONYMS[converted.title] ||= []).push cleaned
# Default mode (neither H2M nor M2H set in the environment): print the whole
# dictionary as one JSON array, titles in sorted order, and exit. Within a
# title, the per-record heteronym lists are ordered by the numeric audio_id of
# their first heteronym and then flattened into one list.
if not (process.env.H2M or process.env.M2H)
  by-audio-id = (.0.audio_id) >> Number
  entries = for title in Object.keys(HETERONYMS).sort!
    { title, heteronyms: flatten sort-by(by-audio-id, HETERONYMS[title]) }
  console.log JSON.stringify entries
  process.exit!
# M2H mode: print `{ h: { <Mandarin term>: "<title>,<title>,..." } }` and exit.
#
# For every record with a 對應華語 field, the field is split into terms on
# `、` and spaces, with "N." counters removed. A term is kept only if it
# occurs as a quoted string in the raw text of index.json. The record's title
# is appended to the term's entry; an empty string stands in for the title
# when title and term are identical.
#
# Fix: removed a trailing `console.log JSON.stringify a: h2m`. It sat after
# `process.exit!`, so it could never run, and it referred to `h2m` before
# that variable is assigned.
#
# NOTE(review): the index path is an absolute path into one developer's home
# directory; the script only runs in this mode where that file exists.
if process.env.M2H
  index = fs.read-file-sync "/Users/audreyt/w/moedict-webkit/a/index.json" \utf8
  m2h = {}
  for w in WIP | w['對應華語']
    title = norm(w['詞目'])
    m = ",#{ w['對應華語'].replace(/、/g \,).replace(/ /g \,).replace(/\d+\./g '') },"
    # Trim separators left at either end by the normalisation above.
    m -= /^,+/
    m -= /,+$/
    for t in m / \,
      # Cheap membership test against the raw JSON text of the index.
      continue unless ~index.indexOf("\"#t\"")
      h = if title is t then '' else title
      if t of m2h
        m2h[t] += ",#h"
      else
        m2h[t] = h
  console.log JSON.stringify h: m2h
  process.exit!
# Regexes applied by `autolink`, in order; filled in further down the script.
LTM-regexes = []

# Wrap every match of each regex in LTM-regexes as `<match>~ (backtick before,
# tilde after). Each wrapped match is passed through `escape` immediately so
# that later regexes cannot match inside an already-linked span; the whole
# string is passed through `unescape` once at the end.
autolink = (chunk) ->
  wrap = (hit) -> escape "`#hit~"
  unescape LTM-regexes.reduce(((text, re) -> text.replace(re, wrap)), chunk)
require! fs
# Load the regex sources stored under `lenToRegex` in lenToRegex.json, compile
# each one as a global RegExp (replacing the source string in the table), and
# register them with LTM-regexes in descending numeric key order.
pre2 = JSON.parse fs.read-file-sync "/Users/audreyt/w/moedict-webkit/a/lenToRegex.json" \utf8
lenToRegex = pre2.lenToRegex
lens = Object.keys(lenToRegex).sort (a, b) -> b - a
for len in lens
  lenToRegex[len] = new RegExp lenToRegex[len], \g
  LTM-regexes.push lenToRegex[len]
# H2M
# Print `{ a: { <title>: "<linked term>,<linked term>,..." } }` and exit.
# For every record with a 對應華語 field, the field is split into terms on
# `、` and spaces (with "N." counters removed) and each term is run through
# `autolink`. A term whose linked form is exactly the record's own title
# wrapped as a link is replaced by an empty string.
h2m = {}
for w in WIP when w['對應華語']
  title = norm w['詞目']
  mandarin = w['對應華語'].replace(/、/g \,).replace(/ /g \,).replace(/\d+\./g '')
  # Trim separators left at either end by the normalisation above.
  trimmed = ",#mandarin," - /^,+/ - /,+$/
  links = for t in trimmed / \,
    link = autolink t
    if link is "`#title~" then "" else link
  h2m[title] = links * \,
console.log JSON.stringify a: h2m
process.exit!
# Illustration of the output shape for the sample record returned by `meng`.
# NOTE(review): every path above ends in `process.exit!`, so this statement
# appears to be unreachable — confirm before relying on it.
console.log JSON.stringify {
"title": "發芽",
"heteronyms": [ {
"synonyms": "暴芽,暴筍",
"pinyin": "四\u20DEfad²nga¹¹ 海\u20DEfad⁵nga⁵⁵ 大\u20DEfad²¹nga¹¹³ 平\u20DEfad²⁴nga⁵³ ",
"definitions": [
"example": [
"\uFFF9春天一到,草仔樹仔相賽開始發芽。\uFFFB春天一到,草木相繼開始萌芽。"
]
"def": "植物的種子,因本身的生理、外部環境條件的合適,而開始萌發的一種現象"
"type": "動"
]
} ]
}
# Return a hard-coded sample source record (headword 【發芽】) showing the field
# layout the converters read: dialect readings, 釋義 definitions with
# examples, 對應華語, 近義詞 / 反義詞, 詞性 and 詞目.
# The only visible caller is the top-level `it = meng!`.
# NOTE(review): the sample has no 檔名 field, which `m2t` reads — confirm
# whether real records always carry it.
function meng => {
"多音字": "",
"四縣音": "\n\n\n\nfad2nga11",
"海陸音": "\n\n\n\nfad5nga55",
"又音": "",
"釋義": [
{
"example": [
[
"\uFFF9春天一到,草仔樹仔相賽開始發芽。",
"\uFFFB(春天一到,草木相繼開始萌芽。)"
]
],
"def": "植物的種子,因本身的生理、外部環境條件的合適,而開始萌發的一種現象"
}
],
"大埔音": "\n\n\n\nfad21nga113",
"對應華語": "萌芽",
"近義詞": "【暴芽】、【暴筍】",
"詞性": "動",
"詞目": "【發芽】",
"反義詞": "",
"文白讀": "",
"饒平音": "\n\n\n\nfad24nga53"
}
# Return the table that maps four-character codes to replacement text.
# It is read through the `PUA` constant when `{[XXXX]}` placeholders in the
# work-in-progress JSON are expanded. Most values are a single character;
# "F545" maps to a three-character ideographic description sequence instead.
function pua => {
"2430": "𤌍",
"2A61": "𪘒",
"3614": "㘔",
"39FE": "㧾",
"3F13": "㼓",
"3F8A": "㾊",
"F305": "𠊎",
"F307": "𫣆",
"F30E": "𢼛",
"F315": "𢫦",
"F34E": "㧡",
"F34F": "䟓",
"F350": "𠲿",
"F354": "䞚",
"F357": "㬹",
"F35A": "𢱤",
"F360": "𪐞",
"F369": "𧊅",
"F36B": "𤊶",
"F36C": "𥯟",
"F36D": "𠠃",
"F36E": "𧩣",
"F36F": "𩜰",
"F372": "𥍉",
"F374": "𢯭",
"F377": "㪐",
"F379": "𣲩",
"F37B": "𥺆",
"F37C": "𣼎",
"F37D": "𣛮",
"F37E": "𨒇",
"F383": "𤐰",
"F385": "𤸁",
"F390": "𢳆",
"F397": "𥯥",
"F39A": "䟘",
"F39B": "𠖄",
"F3A5": "𤸱",
"F3B4": "䀯",
"F3B5": "𪖐",
"F3B9": "𥉌",
"F3C9": "𠜱",
"F401": "𤘅",
"F414": "𩜄",
"F433": "𤍒",
"F434": "𨃰",
"F437": "𠗻",
"F442": "𬠖",
"F444": "𫟧",
"F446": "𫝘",
"F448": "䯋",
"F44F": "𠎷",
"F463": "㗘",
"F488": "𠠝",
"F545": "⿺皮卜"
}