-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmap_char_to_tag.py
184 lines (159 loc) · 5.75 KB
/
map_char_to_tag.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
어절 내에서 원문과 형태소 사이에 정렬을 수행하고 음절기반 학습 코퍼스를 생성
__author__ = 'Jamie (jamie.lim@kakaocorp.com)'
__copyright__ = 'Copyright (C) 2019-, Kakao Corp. All rights reserved.'
"""
###########
# imports #
###########
from argparse import ArgumentParser, Namespace
from collections import defaultdict
import logging
import os
import random
import sys
from typing import Iterator, List
from src.rsc.sejong_corpus import Sentence, sents, Word
from src.rsc.char_align import Aligner, AlignError, MrpChr
#############
# variables #
#############
# Restore dictionary: (char, joined tag) => list of distinct alignments
# (each a list of MrpChr). The position of an alignment in the list is the
# numeric suffix that _maps_to_tag() appends to the output tag.
_MAP_DIC = defaultdict(list)
# Index-parallel to _MAP_DIC: (char, joined tag) => list of the Word objects
# in which each alignment was first seen (written to the ".case" file).
_MAP_CASE = defaultdict(list)
#############
# functions #
#############
def _print_restore_dic(args: Namespace):
    """
    Write the restore dictionary and its companion ".case" file.

    For every (char, tag) key in _MAP_DIC and every alignment stored under
    it, one line "char/tag:idx<TAB>value" is written to each file: the
    dictionary file gets the MrpChr.to_str() form of the alignment, the
    ".case" file gets the word recorded in _MAP_CASE at the same index.
    Nothing is written when ``args.restore_dic`` is not set.

    Args:
        args:  program arguments
    """
    dic_path = args.restore_dic
    if not dic_path:
        return
    case_path = '{}.case'.format(dic_path)
    with open(dic_path, 'w', encoding='UTF-8') as fout, \
            open(case_path, 'w', encoding='UTF-8') as fcase:
        for (char, tag), aligns in _MAP_DIC.items():
            for idx, align in enumerate(aligns):
                restored = MrpChr.to_str(align)
                print('{}/{}:{}\t{}'.format(char, tag, idx, restored), file=fout)
                # _MAP_CASE is kept index-parallel to _MAP_DIC
                origin = _MAP_CASE[char, tag][idx]
                print('{}/{}:{}\t{}'.format(char, tag, idx, origin), file=fcase)
def sent_iter(args: Namespace) -> Iterator[Sentence]:
    """
    Generate the sentences of every ``*.txt`` file in the corpus directory.

    Files are visited in sorted file-name order and each file name is logged
    at INFO level before it is read. Every file is opened inside a ``with``
    block so its handle is closed as soon as the file is exhausted (or the
    generator is closed), instead of being left to the garbage collector.

    Args:
        args:  program arguments (``args.corpus_dir`` is the directory to scan)

    Yields:
        sentence
    """
    for name in sorted(os.listdir(args.corpus_dir)):
        if not name.endswith('.txt'):
            continue
        logging.info(name)
        path = '{}/{}'.format(args.corpus_dir, name)
        # keep the file open only while its sentences are being consumed
        with open(path, 'r', encoding='UTF-8') as fin:
            yield from sents(fin)
def _maps_to_tag(word: Word, maps: List[List[MrpChr]]) -> List[str]:
    """
    Build the output tag of each syllable of a word from its mapping info.

    A syllable aligned to exactly one MrpChr carrying the same character gets
    that MrpChr's tag unchanged. Any other syllable gets the ':'-joined tags
    of its alignment followed by ':<idx>', where <idx> is the position of
    the alignment in _MAP_DIC[char, tag]. Alignments not seen before are
    appended to _MAP_DIC, and the word is recorded in _MAP_CASE.

    Args:
        word:  Word object
        maps:  mapping info per syllable

    Returns:
        list of output tags
    """
    out_tags = []
    for syllable, mrp_chrs in zip(word.raw, maps):
        if len(mrp_chrs) == 1 and mrp_chrs[0].char == syllable:
            out_tags.append(mrp_chrs[0].tag)
            continue
        joined = ':'.join(mrp_chr.tag for mrp_chr in mrp_chrs)
        key = (syllable, joined)
        # _MAP_DIC is a defaultdict(list): a key seen for the first time
        # yields an empty list, so the lookup below fails and idx becomes 0.
        known = _MAP_DIC[key]
        try:
            idx = known.index(mrp_chrs)
        except ValueError:
            idx = len(known)
            known.append(mrp_chrs)
            _MAP_CASE[key].append(word)
        out_tags.append('{}:{}'.format(joined, idx))
    return out_tags
def _print_sent(sent: Sentence, word_per_maps: List[List[List[MrpChr]]]):
    """
    Print one sentence using the per-word mapping info.

    Each word becomes a line "raw<TAB>tag tag ...". The sentence is followed
    by an empty line. If any word has an empty mapping, that word is logged
    at DEBUG level and the whole sentence is not printed; the remaining
    words are still processed.

    Args:
        sent:  Sentence object
        word_per_maps:  list holding the syllable mapping info of each word

    Raises:
        RuntimeError: if a non-empty mapping has a different length than
            the raw word
    """
    rows = []
    failed = False
    for word, maps in zip(sent.words, word_per_maps):
        if not maps:
            # empty mapping (run() stores [] when alignment raised AlignError)
            failed = True
            logging.debug(word)
            continue
        if len(word.raw) != len(maps):
            raise RuntimeError('length of maps is different from length of word')
        tags = _maps_to_tag(word, maps)
        rows.append('{}\t{}'.format(word.raw, ' '.join(tags)))
    if failed:
        return
    print('\n'.join(rows))
    print()
def run(args: Namespace):
    """
    Run the program: align every word of every sentence and print the corpus.

    Sentences come from sent_iter(). When ``0.0 < args.sample < 1.0`` each
    sentence is kept with probability ``args.sample``. Every word is aligned
    with the Aligner; a word that raises AlignError gets an empty mapping
    and, when ``args.unmapped`` is set, the error (annotated with the word
    via ``add_msg``) is written to that file. Finally the restore dictionary
    is written and ``aligner.print_middle_cnt()`` is called.

    Args:
        args:  program arguments
    """
    aligner = Aligner(args.rsc_src)
    funmap = open(args.unmapped, 'w', encoding='UTF-8') if args.unmapped else None
    try:
        for sent in sent_iter(args):
            if 0.0 < args.sample < 1.0 and random.random() >= args.sample:
                continue
            word_per_maps = []
            for word in sent.words:
                try:
                    maps = aligner.align(word)
                except AlignError as algn_err:
                    if funmap is not None:
                        algn_err.add_msg(str(word))
                        print(algn_err, file=funmap)
                    # an empty mapping makes _print_sent() skip the sentence
                    maps = []
                word_per_maps.append(maps)
            _print_sent(sent, word_per_maps)
    finally:
        # make sure the unmapped log is flushed and closed even on failure
        if funmap is not None:
            funmap.close()
    _print_restore_dic(args)
    aligner.print_middle_cnt()
########
# main #
########
def main():
    """
    Parse the command-line arguments, set up output and logging, then run.

    When ``--output`` is given, standard output is redirected to that file
    for the duration of run(); the file is closed and the previous
    ``sys.stdout`` restored afterwards, even if run() raises.
    """
    parser = ArgumentParser(description='어절 내에서 원문과 형태소 사이에 정렬을 수행하고'
                                        ' 음절기반 학습 코퍼스를 생성')
    parser.add_argument('-c', '--corpus-dir', help='corpus dir', metavar='DIR', required=True)
    parser.add_argument('--rsc-src', help='resource source dir <default: ./rsc>',
                        metavar='DIR', default='./rsc')
    parser.add_argument('--output', help='output file <default: stdout>', metavar='FILE')
    parser.add_argument('--restore-dic', help='restore dic output file', metavar='FILE')
    parser.add_argument('--unmapped', help='unmapped log file', metavar='FILE')
    parser.add_argument('--sample', help='sampling ratio', metavar='REAL', type=float, default=1.0)
    parser.add_argument('--debug', help='enable debug', action='store_true')
    args = parser.parse_args()

    fout = open(args.output, 'w', encoding='UTF-8') if args.output else None
    saved_stdout = sys.stdout
    if fout is not None:
        sys.stdout = fout
    try:
        if args.debug:
            logging.basicConfig(level=logging.DEBUG)
        else:
            logging.basicConfig(level=logging.INFO)
        run(args)
    finally:
        # restore stdout and close the output file so buffered data is flushed
        sys.stdout = saved_stdout
        if fout is not None:
            fout.close()


if __name__ == '__main__':
    main()