Skip to content

Commit

Permalink
세종 코퍼스의 오류와 관련된 각종 스크립트 추가 #3
Browse files Browse the repository at this point in the history
  • Loading branch information
krikit committed Jan 28, 2019
1 parent b351a8f commit 767d057
Show file tree
Hide file tree
Showing 8 changed files with 781 additions and 18 deletions.
79 changes: 79 additions & 0 deletions munjong/bin/convert_jamo_to_compat.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-


"""
한글 자모 영역의 코드를 호환 영역으로 변환
__author__ = 'Jamie (jamie.lim@kakaocorp.com)'
__copyright__ = 'Copyright (C) 2019-, Kakao Corp. All rights reserved.'
"""


###########
# imports #
###########
from argparse import ArgumentParser
import logging
import sys

from khaiii.munjong.sejong_corpus import WORD_ID_PTN
from khaiii.resource.jaso import norm_compat


#############
# functions #
#############
def _norm(text: str) -> str:
    """
    Normalize the given text.

    The text is first passed through norm_compat, then two further single
    characters are replaced, in the order listed below.

    Args:
        text:  input text
    Returns:
        normalized text
    """
    # (character to replace, replacement), applied after norm_compat
    extra_pairs = (
        ('ᆞ', 'ㆍ'),    # 0x119e -> 0x318d
        ('ᄝ', 'ㅱ'),    # 0x111d -> 0x3171
    )
    result = norm_compat(text)
    for source_char, target_char in extra_pairs:
        result = result.replace(source_char, target_char)
    return result


def run():
    """
    Copy stdin to stdout, normalizing the word lines on the way.

    A line matched by WORD_ID_PTN is split into three tab-separated columns;
    the second and third columns are normalized with _norm and the line is
    re-assembled. Every other line is printed as it is (without its line
    terminator, which print() re-adds).
    """
    for raw_line in sys.stdin:
        text = raw_line.rstrip('\r\n')
        if WORD_ID_PTN.match(text):
            # a matching line must hold exactly three columns, otherwise the
            # unpacking below raises ValueError
            wid, word, morph = text.split('\t')
            text = '{}\t{}\t{}'.format(wid, _norm(word), _norm(morph))
        print(text)


########
# main #
########
def main():
    """
    Parse the command-line arguments, redirect the standard streams and call run().

    When --input / --output are given, sys.stdin / sys.stdout are replaced by
    UTF-8 text files while run() executes. The files opened here are closed and
    the previous streams are put back afterwards, even if run() raises, so the
    output is flushed deterministically and no file handle is leaked.
    """
    parser = ArgumentParser(description='한글 자모 영역의 코드를 호환 영역으로 변환')
    parser.add_argument('--input', help='input file <default: stdin>', metavar='FILE')
    parser.add_argument('--output', help='output file <default: stdout>', metavar='FILE')
    parser.add_argument('--debug', help='enable debug', action='store_true')
    args = parser.parse_args()

    prev_stdin, prev_stdout = sys.stdin, sys.stdout
    opened = []    # only the files opened by this function get closed
    try:
        if args.input:
            sys.stdin = open(args.input, 'r', encoding='UTF-8')
            opened.append(sys.stdin)
        if args.output:
            sys.stdout = open(args.output, 'w', encoding='UTF-8')
            opened.append(sys.stdout)
        if args.debug:
            logging.basicConfig(level=logging.DEBUG)
        else:
            logging.basicConfig(level=logging.INFO)

        run()
    finally:
        # restore first, so that nothing keeps pointing at a closed file
        sys.stdin, sys.stdout = prev_stdin, prev_stdout
        for stream in opened:
            stream.close()


if __name__ == '__main__':
    main()
106 changes: 106 additions & 0 deletions munjong/bin/detect_sejong_period_error.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-


"""
detect period error of Sejong corpus
__author__ = 'Jamie (jamie.lim@kakaocorp.com)'
__copyright__ = 'Copyright (C) 2019-, Kakao Corp. All rights reserved.'
"""


###########
# imports #
###########
from argparse import ArgumentParser
import logging
import os
import re
import sys
from typing import Iterator, TextIO, Tuple

from khaiii.munjong.sejong_corpus import Morph, WORD_ID_PTN


#############
# functions #
#############
def _get_two_lines(fin: TextIO) -> Iterator[Tuple[str, str]]:
    """
    Generate (current line, next line) pairs of consecutive lines of a file.

    Trailing '\\r' and '\\n' characters are removed from both lines. An input
    with fewer than two lines yields nothing, and the last line only ever
    appears as the second element of a pair.

    Args:
        fin:  input file
    Yields:
        current line
        next line
    """
    stripped_lines = (raw.rstrip('\r\n') for raw in fin)
    previous = next(stripped_lines, '')    # empty input: nothing to pair up
    for following in stripped_lines:
        yield previous, following
        previous = following


def _is_correct_eos(line: str) -> bool:
    """
    Tell whether the morpheme column of a word line ends in an accepted way.

    The line must consist of exactly three tab-separated columns (otherwise
    the unpacking raises ValueError); only the third one is inspected.

    Args:
        line:  line (word)
    Returns:
        True if the third column matches one of the accepted patterns or its
        tag sequence has one of the accepted suffixes, False otherwise
    """
    _wid, _raw, morphs_str = line.split('\t')
    accepted_patterns = (
        r'.+/EF \+ ./SF$',    # ... an EF morpheme, then one character tagged SF
        r'.+/SF \+ [\'"’”」\]]/SS$',    # ... an SF morpheme, then a closing quote/bracket tagged SS
    )
    if any(re.match(pattern, morphs_str) for pattern in accepted_patterns):
        return True
    # fall back to comparing tags only, e.g. 'NNG+EF+SF+SS+JKQ'
    tags_str = '+'.join(Morph.parse(item).tag for item in morphs_str.split(' + '))
    return tags_str.endswith(('+SF+SS+JKQ', '+SF+SS+VCP+ETM'))


def run():
    """
    Report word lines from stdin that are suspected to carry a period error.

    A line is reported as '<file name>:<line number>\\t<line>' when all of the
    following hold: it has three tab-separated columns, its first column is
    matched by WORD_ID_PTN, its third column contains '/SF + ', the line after
    it starts with '</', and _is_correct_eos rejects it.
    """
    file_name = os.path.basename(sys.stdin.name)
    line_pairs = _get_two_lines(sys.stdin)
    for line_num, (current, following) in enumerate(line_pairs, start=1):
        fields = current.split('\t')
        if len(fields) != 3 or not WORD_ID_PTN.match(fields[0]):
            continue
        # SF followed by more morphemes, right before a line starting with '</'
        is_candidate = '/SF + ' in fields[2] and following.startswith('</')
        if is_candidate and not _is_correct_eos(current):
            print('{}:{}\t{}'.format(file_name, line_num, current))


########
# main #
########
def main():
    """
    Parse the command-line arguments, redirect the standard streams and call run().

    When --input / --output are given, sys.stdin / sys.stdout are replaced by
    text files while run() executes. The files are opened explicitly as UTF-8
    instead of relying on the locale's preferred encoding, so the Korean text
    is read and written the same way on every platform. The files opened here
    are closed and the previous streams are put back afterwards, even if run()
    raises.
    """
    parser = ArgumentParser(description='detect period error of Sejong corpus')
    parser.add_argument('--input', help='input file <default: stdin>', metavar='FILE')
    parser.add_argument('--output', help='output file <default: stdout>', metavar='FILE')
    parser.add_argument('--debug', help='enable debug', action='store_true')
    args = parser.parse_args()

    prev_stdin, prev_stdout = sys.stdin, sys.stdout
    opened = []    # only the files opened by this function get closed
    try:
        if args.input:
            sys.stdin = open(args.input, 'rt', encoding='UTF-8')
            opened.append(sys.stdin)
        if args.output:
            sys.stdout = open(args.output, 'wt', encoding='UTF-8')
            opened.append(sys.stdout)
        if args.debug:
            logging.basicConfig(level=logging.DEBUG)
        else:
            logging.basicConfig(level=logging.INFO)

        run()
    finally:
        # restore first, so that nothing keeps pointing at a closed file
        sys.stdin, sys.stdout = prev_stdin, prev_stdout
        for stream in opened:
            stream.close()


if __name__ == '__main__':
    main()
86 changes: 86 additions & 0 deletions munjong/bin/fix_final_symbol_error.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-


"""
fix final symbol errors on Sejong corpus
__author__ = 'Jamie (jamie.lim@kakaocorp.com)'
__copyright__ = 'Copyright (C) 2019-, Kakao Corp. All rights reserved.'
"""


###########
# imports #
###########
from argparse import ArgumentParser
import logging
import os
import sys

from khaiii.munjong.sejong_corpus import Morph, Word, WORD_ID_PTN


#############
# functions #
#############
def _attach_missing_symbol(word: Word):
    """
    Append a morpheme for a trailing symbol that the analysis left out.

    Nothing happens unless the raw word equals the concatenated morpheme
    lexicals plus exactly one extra character at the end. That character is
    then appended to word.morphs (in place) as:
        '.'  ->  SF, only when the last morpheme is tagged EC
        ','  ->  SP
        '"'  ->  SS
    Any other trailing character is left alone.

    Args:
        word:  Word object
    """
    surface = word.raw
    joined_lex = ''.join(morph.lex for morph in word.morphs)
    has_one_extra_char = (surface.startswith(joined_lex)
                          and len(surface) == len(joined_lex) + 1)
    if not has_one_extra_char:
        return
    tail = surface[-1]
    if tail == '.':
        # a period is only attached after a morpheme tagged EC
        tag = 'SF' if word.morphs[-1].tag == 'EC' else None
    else:
        tag = {',': 'SP', '"': 'SS'}.get(tail)
    if tag is not None:
        word.morphs.append(Morph(tail, tag))


def run():
    """
    Copy stdin to stdout, attaching missing final symbols to the word lines.

    A line matched by WORD_ID_PTN is parsed into a Word, passed through
    _attach_missing_symbol and printed; every other line is printed as it is.
    """
    file_name = os.path.basename(sys.stdin.name)
    for line_num, raw_line in enumerate(sys.stdin, start=1):
        text = raw_line.rstrip('\r\n')
        if WORD_ID_PTN.match(text):
            word = Word.parse(text, file_name, line_num)
            _attach_missing_symbol(word)
            print(word)
        else:
            print(text)


########
# main #
########
def main():
    """
    Parse the command-line arguments, redirect the standard streams and call run().

    When --input / --output are given, sys.stdin / sys.stdout are replaced by
    text files while run() executes. The files are opened explicitly as UTF-8
    instead of relying on the locale's preferred encoding, so the Korean text
    is read and written the same way on every platform. The files opened here
    are closed and the previous streams are put back afterwards, even if run()
    raises.
    """
    parser = ArgumentParser(description='fix final symbol errors on Sejong corpus')
    parser.add_argument('--input', help='input file <default: stdin>', metavar='FILE')
    parser.add_argument('--output', help='output file <default: stdout>', metavar='FILE')
    parser.add_argument('--debug', help='enable debug', action='store_true')
    args = parser.parse_args()

    prev_stdin, prev_stdout = sys.stdin, sys.stdout
    opened = []    # only the files opened by this function get closed
    try:
        if args.input:
            sys.stdin = open(args.input, 'rt', encoding='UTF-8')
            opened.append(sys.stdin)
        if args.output:
            sys.stdout = open(args.output, 'wt', encoding='UTF-8')
            opened.append(sys.stdout)
        if args.debug:
            logging.basicConfig(level=logging.DEBUG)
        else:
            logging.basicConfig(level=logging.INFO)

        run()
    finally:
        # restore first, so that nothing keeps pointing at a closed file
        sys.stdin, sys.stdout = prev_stdin, prev_stdout
        for stream in opened:
            stream.close()


if __name__ == '__main__':
    main()
96 changes: 96 additions & 0 deletions munjong/bin/recover_english_case.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-


"""
recover cases of English letters in Sejong corpus
__author__ = 'Jamie (jamie.lim@kakaocorp.com)'
__copyright__ = 'Copyright (C) 2019-, Kakao Corp. All rights reserved.'
"""


###########
# imports #
###########
from argparse import ArgumentParser
import copy
import logging
import os
import re
import sys

from khaiii.munjong.sejong_corpus import Word, WORD_ID_PTN


#############
# functions #
#############
def _recover(word: Word):
    """
    Give the ASCII letters of the morphemes the case they have in the raw word.

    The ASCII letters of word.raw and those of the morpheme lexicals are
    paired up in order of appearance; wherever a pair differs, the letter
    from the raw word replaces the one in the morpheme. The work is done on a
    deep copy, and word.morphs is only replaced when something changed, so
    the word is left untouched if an error occurs half-way.

    Args:
        word:  Word object
    Raises:
        IndexError: the morphemes hold more ASCII letters than the raw word
    """
    letter_ptn = re.compile(r'[a-zA-Z]')
    raw_letters = [char for char in word.raw if letter_ptn.match(char)]
    restored = copy.deepcopy(word)
    cursor = 0    # index of the next unused letter in raw_letters
    changed = False
    for morph in restored.morphs:
        # enumerate keeps iterating over the string it was given; each
        # replacement below has length one, so the positions stay aligned
        for pos, char in enumerate(morph.lex):
            if letter_ptn.match(char) is None:
                continue
            raw_letter = raw_letters[cursor]
            cursor += 1
            if raw_letter != char:
                morph.lex = morph.lex[:pos] + raw_letter + morph.lex[pos+1:]
                changed = True
    if changed:
        # log while word still holds the morphemes from before the recovery
        logging.info('%s => %s', str(word), restored.morph_str())
        word.morphs = restored.morphs


def run():
    """
    Copy stdin to stdout, recovering the letter case in the word lines.

    A line matched by WORD_ID_PTN is parsed into a Word and passed through
    _recover before it is printed. If _recover raises IndexError, the error
    is logged and the word is still printed. Every other line is printed as
    it is.
    """
    file_name = os.path.basename(sys.stdin.name)
    for line_num, raw_line in enumerate(sys.stdin, start=1):
        text = raw_line.rstrip('\r\n')
        if WORD_ID_PTN.match(text):
            word = Word.parse(text, file_name, line_num)
            try:
                _recover(word)
            except IndexError as idx_err:
                logging.error('%s(%d): %s: %s', file_name, line_num, idx_err, word)
            print(word)
        else:
            print(text)


########
# main #
########
def main():
    """
    Parse the command-line arguments, redirect the standard streams and call run().

    When --input / --output are given, sys.stdin / sys.stdout are replaced by
    UTF-8 text files while run() executes. The files opened here are closed and
    the previous streams are put back afterwards, even if run() raises, so the
    output is flushed deterministically and no file handle is leaked.
    """
    parser = ArgumentParser(description='recover cases of English letters in Sejong corpus')
    parser.add_argument('--input', help='input file <default: stdin>', metavar='FILE')
    parser.add_argument('--output', help='output file <default: stdout>', metavar='FILE')
    parser.add_argument('--debug', help='enable debug', action='store_true')
    args = parser.parse_args()

    prev_stdin, prev_stdout = sys.stdin, sys.stdout
    opened = []    # only the files opened by this function get closed
    try:
        if args.input:
            sys.stdin = open(args.input, 'r', encoding='UTF-8')
            opened.append(sys.stdin)
        if args.output:
            sys.stdout = open(args.output, 'w', encoding='UTF-8')
            opened.append(sys.stdout)
        if args.debug:
            logging.basicConfig(level=logging.DEBUG)
        else:
            logging.basicConfig(level=logging.INFO)

        run()
    finally:
        # restore first, so that nothing keeps pointing at a closed file
        sys.stdin, sys.stdout = prev_stdin, prev_stdout
        for stream in opened:
            stream.close()


if __name__ == '__main__':
    main()
Loading

0 comments on commit 767d057

Please sign in to comment.