-
Notifications
You must be signed in to change notification settings - Fork 24
/
2014.pl
100 lines (92 loc) · 5.19 KB
/
2014.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
use utf8;
use strict;
use Text::CSV_XS;
use JSON::XS;
open my $ch, '<:utf8', '字頭部首筆畫.csv';
<$ch>;
my $csv_ch = Text::CSV_XS->new ({ binary => 1 });
my %CH;
while (my $row = $csv_ch->getline($ch)) {
#cnscode,educode,字頭,部首,部首代碼,部首外筆畫,總筆畫
my (undef, undef, $ch, $rad, undef, $nrsc, $sc) = @$row;
$CH{$ch} = { radical => $rad, non_radical_stroke_count => int $nrsc, stroke_count => int $sc };
}
# 字詞流水序 正體字形 簡化字形 音序 臺/陸特有詞 臺/陸特有音 臺灣音讀 臺灣漢拼 大陸音讀 大陸漢拼 釋義1 釋義2 釋義3 釋 義4 釋義5 釋義6 釋義7 釋義8 釋義9 釋義10 釋義11 釋義12 釋義13 釋義14 釋義15 釋義16 釋義17 釋義18 釋義19 釋義20 釋義21 釋義22 釋義23 釋義2 4 釋義25 釋義26 釋義27 釋義28 釋義29 釋義 30
# 稿件階段,稿件狀態,備注,字詞流水序,正體字形,簡化字形,音序,臺/陸特有詞,臺/陸特有音,臺灣音讀,臺灣漢拼,大陸音讀,大陸漢拼,釋義1,釋義2,釋義3,釋義4,釋義5,釋義6,釋義7,釋義8,釋義9,釋義10,釋義11,釋義12,釋義13,釋義14,釋義15,釋義16,釋義17,釋義18,釋義19,釋義20,釋義21,釋義22,釋義23,釋義24,釋義25,釋義26,釋義27,釋義28,釋義29,釋義30
# 稿件版本,稿件階段,稿件狀態,備注,字詞流水序,正體字形,簡化字形,音序,臺/陸特有詞,臺/陸特有音,臺灣音讀,臺灣漢拼,大陸音讀,大陸漢拼,釋義1,釋義2,釋義3,釋義4,釋義5,釋義6,釋義7,釋義8,釋義9,釋義10,釋義11,釋義12,釋義13,釋義14,釋義15,釋義16,釋義17,釋義18,釋義19,釋義20,釋義21,釋義22,釋義23,釋義24,釋義25,釋義26,釋義27,釋義28,釋義29,釋義30
open my $fh, '<:utf8', '兩岸詞典.csv';
binmode STDERR, ':utf8';
binmode STDOUT, ':utf8';
<$fh>;
my %heteronyms;
my %alt;
my %seen;
my $csv = Text::CSV_XS->new ({ binary => 1 });
while (my $row = $csv->getline ($fh)) {
my ($version, $phase, $state, $id, $title) = @$row;
# 稿件版本,稿件階段,稿件狀態,備注,字詞流水序, 正體字形,簡化字形,音序,臺/陸特有詞,臺/陸特有音,臺灣音讀,臺灣漢拼,大陸音讀,大陸漢拼,釋義1,釋義2,釋義3,釋義4,釋義5,釋義6,釋義7,釋義8,釋義9,釋義10,釋義11,釋義12,釋義13,釋義14,釋義15,釋義16,釋義17,釋義18,釋義19,釋義20,釋義21,釋義22,釋義23,釋義24,釋義25,釋義26,釋義27,釋義28,釋義29,釋義30
my (undef, undef, undef, undef, undef, $title_cn, $seq_sound, $spec_word, $spec_sound, $bpmf, $pinyin, $bpmf_cn, $pinyin_cn, @defs) = map {
s/ɡ/g/g;
s/[〜~]/$title/g;
s/○○頁//g;
s/""/"/g;
s/>\s*</></g;
s/\r//g;
s/★/陸\x{20DD}/g;
s/▲/臺\x{20DD}/g;
$_; } @$row;
$bpmf =~ s/丨/ㄧ/g;
$bpmf_cn =~ s/丨/ㄧ/g;
$spec_word =~ s/\x{20DD}/\x{20DF}/g if $spec_word;
# TODO: <<詞條較長時陸音哪裡發音不同,需要視覺化>>
if ($spec_sound) {
$spec_sound =~ s/\x{20DD}/\x{20DF}/g;
$bpmf = "$bpmf$bpmf_cn$spec_sound";
$pinyin = "$pinyin$pinyin_cn$spec_sound";
}
else {
$bpmf .= "<br>陸\x{20DD}$bpmf_cn" unless !$bpmf_cn or $bpmf eq $bpmf_cn;
$pinyin .= "<br>陸\x{20DD}$pinyin_cn" unless !$pinyin_cn or $pinyin eq $pinyin_cn;
}
$seen{$title}++;
undef $title_cn if $title_cn eq $title;
$alt{$title} = $title_cn if $title_cn;
push @{ $heteronyms{$title} }, {
id => $id,
pinyin => $pinyin,
bopomofo => $bpmf,
($title_cn ? (alt => $alt{$title}) : ()),
($spec_word ? (specific_to => $spec_word) : ()),
definitions => [ map {
my %entry;
s/^\d+\.\s*//;
s/^\s*"+\s*//g;
s/\s*"+\s*$//g;
if (s/[[\[]例[]\]]([^。]+)。?//) {
$entry{example} = [ "例\x{20DD}" . join('、', map "「$_」", split /[|︱│\∣]/, $1) . "。" ];
}
s/[\[[]([^\x00-\xff])[]\]]/$1\x{20DD}/g;
$entry{def} = $_;
\%entry
} grep {/\S/} @defs
]
};
}
my $comma = '[';
for my $title (sort keys %heteronyms) {
my $json = JSON::XS->new->pretty(1)->canonical->encode({
title => $title,
heteronyms => $heteronyms{$title},
%{$CH{$title} || {}},
});
$json =~ s/" : /":/g;
print "$comma $json";
$comma = ',';
}
print "]";
my $comma = '[';
for (sort keys %seen) {
warn qq[$comma "$_"\n];
$comma = ',';
}
warn "]\n";