-
Notifications
You must be signed in to change notification settings - Fork 295
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
8 changed files
with
781 additions
and
18 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,79 @@ | ||
#!/usr/bin/env python3 | ||
# -*- coding: utf-8 -*- | ||
|
||
|
||
""" | ||
한글 자모 영역의 코드를 호환 영역으로 변환 | ||
__author__ = 'Jamie (jamie.lim@kakaocorp.com)' | ||
__copyright__ = 'Copyright (C) 2019-, Kakao Corp. All rights reserved.' | ||
""" | ||
|
||
|
||
########### | ||
# imports # | ||
########### | ||
from argparse import ArgumentParser | ||
import logging | ||
import sys | ||
|
||
from khaiii.munjong.sejong_corpus import WORD_ID_PTN | ||
from khaiii.resource.jaso import norm_compat | ||
|
||
|
||
############# | ||
# functions # | ||
############# | ||
def _norm(text: str) -> str:
    """
    Normalize a text: run it through norm_compat, then map two more jamo.
    Args:
        text:  input text
    Returns:
        normalized text
    """
    # code points replaced here, after norm_compat has been applied
    extra_mapping = str.maketrans({
        'ᆞ': 'ㆍ',    # 0x119e -> 0x318d
        'ᄝ': 'ㅱ',    # 0x111d -> 0x3171
    })
    return norm_compat(text).translate(extra_mapping)
|
||
|
||
def run():
    """
    start point of the program: copy stdin to stdout line by line,
    normalizing the word and morpheme columns of every line that matches WORD_ID_PTN
    """
    for raw_line in sys.stdin:
        text = raw_line.rstrip('\r\n')
        if WORD_ID_PTN.match(text):
            # word line: three tab-separated columns (id, word, morphemes)
            wid, word, morph = text.split('\t')
            text = '{}\t{}\t{}'.format(wid, _norm(word), _norm(morph))
        print(text)
|
||
|
||
######## | ||
# main # | ||
######## | ||
def main():
    """
    parse command line arguments, redirect stdin/stdout to the given files,
    set the logging level and hand over to run()
    """
    options = ArgumentParser(description='한글 자모 영역의 코드를 호환 영역으로 변환')
    file_options = (
        ('--input', 'input file <default: stdin>'),
        ('--output', 'output file <default: stdout>'),
    )
    for flag, help_text in file_options:
        options.add_argument(flag, help=help_text, metavar='FILE')
    options.add_argument('--debug', help='enable debug', action='store_true')
    args = options.parse_args()

    # replace the standard streams so that run() only has to deal with stdin/stdout
    if args.input:
        sys.stdin = open(args.input, 'r', encoding='UTF-8')
    if args.output:
        sys.stdout = open(args.output, 'w', encoding='UTF-8')
    logging.basicConfig(level=logging.DEBUG if args.debug else logging.INFO)

    run()


if __name__ == '__main__':
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,106 @@ | ||
#!/usr/bin/env python3 | ||
# -*- coding: utf-8 -*- | ||
|
||
|
||
""" | ||
detect period error of Sejong corpus | ||
__author__ = 'Jamie (jamie.lim@kakaocorp.com)' | ||
__copyright__ = 'Copyright (C) 2019-, Kakao Corp. All rights reserved.' | ||
""" | ||
|
||
|
||
########### | ||
# imports # | ||
########### | ||
from argparse import ArgumentParser | ||
import logging | ||
import os | ||
import re | ||
import sys | ||
from typing import Iterator, TextIO, Tuple | ||
|
||
from khaiii.munjong.sejong_corpus import Morph, WORD_ID_PTN | ||
|
||
|
||
############# | ||
# functions # | ||
############# | ||
def _get_two_lines(fin: TextIO) -> Iterator[Tuple[str, str]]:
    """
    iterate over a file as overlapping pairs of consecutive lines (generator)
    Args:
        fin:  input file
    Yields:
        current line (line terminator removed)
        next line (line terminator removed)
    """
    stripped_lines = (raw.rstrip('\r\n') for raw in fin)
    # an empty input has no first line; '' is used and nothing gets yielded
    previous = next(stripped_lines, '')
    for following in stripped_lines:
        yield previous, following
        previous = following
|
||
|
||
def _is_correct_eos(line: str) -> bool:
    """
    check whether the morpheme column of a word line ends with an accepted pattern
    Args:
        line:  word line with exactly three tab-separated columns
    Returns:
        True if the third column ends with one of the accepted patterns
    Raises:
        ValueError: if the line does not have exactly three columns
    """
    _wid, _raw, analysis = line.split('\t')
    # "<something>/EF + <one character>/SF" at the end
    if re.match(r'.+/EF \+ ./SF$', analysis):
        return True
    # "<something>/SF + <one closing quote or bracket>/SS" at the end
    if re.match(r'.+/SF \+ [\'"’”」\]]/SS$', analysis):
        return True
    # otherwise decide on the sequence of tags only
    tags = '+'.join(Morph.parse(token).tag for token in analysis.split(' + '))
    return tags.endswith(('+SF+SS+JKQ', '+SF+SS+VCP+ETM'))
|
||
|
||
def run():
    """
    start point of the program: report every word line that contains '/SF + ',
    is directly followed by a line starting with '</' and is not accepted by _is_correct_eos
    """
    source_name = os.path.basename(sys.stdin.name)
    pairs = _get_two_lines(sys.stdin)
    for number, (current, following) in enumerate(pairs, start=1):
        columns = current.split('\t')
        is_word_line = len(columns) == 3 and WORD_ID_PTN.match(columns[0])
        if not is_word_line:
            continue
        # only a word holding an SF morpheme followed by more morphemes, right before a closing tag
        is_candidate = '/SF + ' in columns[2] and following.startswith('</')
        if is_candidate and not _is_correct_eos(current):
            print('{}:{}\t{}'.format(source_name, number, current))
|
||
|
||
######## | ||
# main # | ||
######## | ||
def main():
    """
    main function processes only argument parsing

    With --input / --output, stdin / stdout are replaced by the given files. The files are
    opened explicitly as UTF-8 instead of relying on the locale's default encoding.
    """
    parser = ArgumentParser(description='detect period error of Sejong corpus')
    parser.add_argument('--input', help='input file <default: stdin>', metavar='FILE')
    parser.add_argument('--output', help='output file <default: stdout>', metavar='FILE')
    parser.add_argument('--debug', help='enable debug', action='store_true')
    args = parser.parse_args()

    if args.input:
        sys.stdin = open(args.input, 'rt', encoding='UTF-8')
    if args.output:
        sys.stdout = open(args.output, 'wt', encoding='UTF-8')
    if args.debug:
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.INFO)

    run()


if __name__ == '__main__':
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,86 @@ | ||
#!/usr/bin/env python3 | ||
# -*- coding: utf-8 -*- | ||
|
||
|
||
""" | ||
fix final symbol errors on Sejong corpus | ||
__author__ = 'Jamie (jamie.lim@kakaocorp.com)' | ||
__copyright__ = 'Copyright (C) 2019-, Kakao Corp. All rights reserved.' | ||
""" | ||
|
||
|
||
########### | ||
# imports # | ||
########### | ||
from argparse import ArgumentParser | ||
import logging | ||
import os | ||
import sys | ||
|
||
from khaiii.munjong.sejong_corpus import Morph, Word, WORD_ID_PTN | ||
|
||
|
||
############# | ||
# functions # | ||
############# | ||
def _attach_missing_symbol(word: Word):
    """
    attach a morpheme for a final symbol that is in the raw word but missing from its morphemes

    Nothing is done unless the raw word is exactly the concatenated morpheme lexicals
    plus one extra trailing character.
    Args:
        word:  Word object (its morphs list is extended in place)
    """
    lexicals = ''.join(morph.lex for morph in word.morphs)
    if not word.raw.startswith(lexicals) or len(word.raw) != len(lexicals) + 1:
        return
    last_symbol = word.raw[-1]
    if last_symbol == '.':
        # a period is attached only after a morpheme tagged EC; with no morpheme at all
        # there is no preceding tag to look at, so the word is left as it is
        if word.morphs and word.morphs[-1].tag == 'EC':
            word.morphs.append(Morph('.', 'SF'))
    elif last_symbol == ',':
        word.morphs.append(Morph(',', 'SP'))
    elif last_symbol == '"':
        word.morphs.append(Morph('"', 'SS'))
|
||
|
||
def run():
    """
    start point of the program: copy stdin to stdout, passing every line that matches
    WORD_ID_PTN through Word.parse and _attach_missing_symbol before printing it
    """
    source_name = os.path.basename(sys.stdin.name)
    for number, raw_line in enumerate(sys.stdin, start=1):
        text = raw_line.rstrip('\r\n')
        if WORD_ID_PTN.match(text):
            word = Word.parse(text, source_name, number)
            _attach_missing_symbol(word)
            print(word)
        else:
            # not a word line: echo as it is
            print(text)
|
||
|
||
######## | ||
# main # | ||
######## | ||
def main():
    """
    main function processes only argument parsing

    With --input / --output, stdin / stdout are replaced by the given files. The files are
    opened explicitly as UTF-8 instead of relying on the locale's default encoding.
    """
    parser = ArgumentParser(description='fix final symbol errors on Sejong corpus')
    parser.add_argument('--input', help='input file <default: stdin>', metavar='FILE')
    parser.add_argument('--output', help='output file <default: stdout>', metavar='FILE')
    parser.add_argument('--debug', help='enable debug', action='store_true')
    args = parser.parse_args()

    if args.input:
        sys.stdin = open(args.input, 'rt', encoding='UTF-8')
    if args.output:
        sys.stdout = open(args.output, 'wt', encoding='UTF-8')
    if args.debug:
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.INFO)

    run()


if __name__ == '__main__':
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,96 @@ | ||
#!/usr/bin/env python3 | ||
# -*- coding: utf-8 -*- | ||
|
||
|
||
""" | ||
recover cases of English letters in Sejong corpus | ||
__author__ = 'Jamie (jamie.lim@kakaocorp.com)' | ||
__copyright__ = 'Copyright (C) 2019-, Kakao Corp. All rights reserved.' | ||
""" | ||
|
||
|
||
########### | ||
# imports # | ||
########### | ||
from argparse import ArgumentParser | ||
import copy | ||
import logging | ||
import os | ||
import re | ||
import sys | ||
|
||
from khaiii.munjong.sejong_corpus import Word, WORD_ID_PTN | ||
|
||
|
||
############# | ||
# functions # | ||
############# | ||
def _recover(word: Word):
    """
    recover the English letters of the morphemes from the letters of the raw word

    The ASCII letters of the raw word and those of the morpheme lexicals are paired off
    in order; a morpheme letter that differs from its counterpart is overwritten by it.
    Args:
        word:  Word object (its morphs are replaced only when at least one letter changed)
    Raises:
        IndexError: if the morphemes contain more ASCII letters than the raw word
    """
    is_letter = re.compile(r'[a-zA-Z]').match
    raw_letters = [char for char in word.raw if is_letter(char)]
    # work on a copy so that the word stays untouched when an IndexError is raised halfway
    candidate = copy.deepcopy(word)
    consumed = 0    # number of raw letters paired off so far
    changed = False
    for morph in candidate.morphs:
        chars = list(morph.lex)
        touched = False
        for pos, char in enumerate(chars):
            if not is_letter(char):
                continue
            raw_letter = raw_letters[consumed]
            consumed += 1
            if raw_letter != char:
                chars[pos] = raw_letter
                touched = True
        if touched:
            morph.lex = ''.join(chars)
            changed = True
    if changed:
        logging.info('%s => %s', str(word), candidate.morph_str())
        word.morphs = candidate.morphs
|
||
|
||
def run():
    """
    start point of the program: copy stdin to stdout, passing every line that matches
    WORD_ID_PTN through Word.parse and _recover before printing it
    """
    source_name = os.path.basename(sys.stdin.name)
    for number, raw_line in enumerate(sys.stdin, start=1):
        output = raw_line.rstrip('\r\n')
        if WORD_ID_PTN.match(output):
            entry = Word.parse(output, source_name, number)
            try:
                _recover(entry)
            except IndexError as idx_err:
                # the failure is logged and the word is still printed
                logging.error('%s(%d): %s: %s', source_name, number, idx_err, entry)
            output = entry
        print(output)
|
||
|
||
######## | ||
# main # | ||
######## | ||
def main():
    """
    parse command line arguments, redirect stdin/stdout to the given files,
    set the logging level and hand over to run()
    """
    options = ArgumentParser(description='recover cases of English letters in Sejong corpus')
    file_options = (
        ('--input', 'input file <default: stdin>'),
        ('--output', 'output file <default: stdout>'),
    )
    for flag, help_text in file_options:
        options.add_argument(flag, help=help_text, metavar='FILE')
    options.add_argument('--debug', help='enable debug', action='store_true')
    args = options.parse_args()

    # replace the standard streams so that run() only has to deal with stdin/stdout
    if args.input:
        sys.stdin = open(args.input, 'r', encoding='UTF-8')
    if args.output:
        sys.stdout = open(args.output, 'w', encoding='UTF-8')
    logging.basicConfig(level=logging.DEBUG if args.debug else logging.INFO)

    run()


if __name__ == '__main__':
    main()
Oops, something went wrong.