세종 코퍼스의 오류와 관련된 각종 스크립트 추가 #3

kakao · Jan 28, 2019 · 767d057 · 767d057
1 parent b351a8f
commit 767d057
Show file tree

Hide file tree

Showing 8 changed files with 781 additions and 18 deletions.
diff --git a/munjong/bin/convert_jamo_to_compat.py b/munjong/bin/convert_jamo_to_compat.py
@@ -0,0 +1,79 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+
+"""
+한글 자모 영역의 코드를 호환 영역으로 변환
+__author__ = 'Jamie (jamie.lim@kakaocorp.com)'
+__copyright__ = 'Copyright (C) 2019-, Kakao Corp. All rights reserved.'
+"""
+
+
+###########
+# imports #
+###########
+from argparse import ArgumentParser
+import logging
+import sys
+
+from khaiii.munjong.sejong_corpus import WORD_ID_PTN
+from khaiii.resource.jaso import norm_compat
+
+
+#############
+# functions #
+#############
+def _norm(text: str) -> str:
+    """
+    normalize text by converting Hangul Jamo area codes into the compatibility area
+    Args:
+        text:  input text
+    Returns:
+        normalized text
+    """
+    # replacements applied on top of the result of norm_compat()
+    extra_pairs = (
+        ('ᆞ', 'ㆍ'),    # 0x119e -> 0x318d
+        ('ᄝ', 'ㅱ'),    # 0x111d -> 0x3171
+    )
+    result = norm_compat(text)
+    for jamo, compat in extra_pairs:
+        result = result.replace(jamo, compat)
+    return result
+
+
+def run():
+    """
+    read lines from stdin, normalize the word and morpheme columns of word lines
+    and print every line (other lines are printed unchanged) to stdout
+    """
+    for raw_line in sys.stdin:
+        text = raw_line.rstrip('\r\n')
+        if WORD_ID_PTN.match(text):
+            # word line: exactly three tab-separated columns are expected here
+            wid, word, morph = text.split('\t')
+            text = '{}\t{}\t{}'.format(wid, _norm(word), _norm(morph))
+        print(text)
+
+
+########
+# main #
+########
+def main():
+    """
+    parse command line arguments, redirect stdin/stdout to the given files,
+    configure the logging level and then call run()
+    """
+    parser = ArgumentParser(description='한글 자모 영역의 코드를 호환 영역으로 변환')
+    for option, usage in (('--input', 'input file <default: stdin>'),
+                          ('--output', 'output file <default: stdout>')):
+        parser.add_argument(option, help=usage, metavar='FILE')
+    parser.add_argument('--debug', help='enable debug', action='store_true')
+    args = parser.parse_args()
+
+    # run() works on sys.stdin / sys.stdout, so file arguments replace the streams
+    if args.input:
+        sys.stdin = open(args.input, 'r', encoding='UTF-8')
+    if args.output:
+        sys.stdout = open(args.output, 'w', encoding='UTF-8')
+    logging.basicConfig(level=logging.DEBUG if args.debug else logging.INFO)
+
+    run()
+
+
+if __name__ == '__main__':
+    main()
diff --git a/munjong/bin/detect_sejong_period_error.py b/munjong/bin/detect_sejong_period_error.py
@@ -0,0 +1,106 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+
+"""
+detect period error of Sejong corpus
+__author__ = 'Jamie (jamie.lim@kakaocorp.com)'
+__copyright__ = 'Copyright (C) 2019-, Kakao Corp. All rights reserved.'
+"""
+
+
+###########
+# imports #
+###########
+from argparse import ArgumentParser
+import logging
+import os
+import re
+import sys
+from typing import Iterator, TextIO, Tuple
+
+from khaiii.munjong.sejong_corpus import Morph, WORD_ID_PTN
+
+
+#############
+# functions #
+#############
+def _get_two_lines(fin: TextIO) -> Iterator[Tuple[str, str]]:
+    """
+    generate pairs of adjacent lines (line endings stripped) from file
+    Args:
+        fin:  input file
+    Yields:
+        current line
+        next line
+    """
+    stripped = (raw.rstrip('\r\n') for raw in fin)
+    previous = next(stripped, '')    # empty input: nothing is yielded below
+    for following in stripped:
+        yield previous, following
+        previous = following
+
+
+def _is_correct_eos(line: str) -> bool:
+    """
+    check whether the morpheme column of a word line ends with an accepted pattern
+    Args:
+        line:  line (word) with exactly three tab-separated columns
+    Returns:
+        whether correct or not
+    """
+    _wid, _raw, morphs_str = line.split('\t')
+    accepted_ptns = (
+        r'.+/EF \+ ./SF$',    # EF morpheme followed by a one-character SF morpheme
+        r'.+/SF \+ [\'"’”」\]]/SS$',    # SF morpheme followed by a closing quote/bracket as SS
+    )
+    if any(re.match(ptn, morphs_str) for ptn in accepted_ptns):
+        return True
+    # otherwise decide by the sequence of tags at the end of the word
+    tags = [Morph.parse(token).tag for token in morphs_str.split(' + ')]
+    return '+'.join(tags).endswith(('+SF+SS+JKQ', '+SF+SS+VCP+ETM'))
+
+
+def run():
+    """
+    scan stdin and report word lines that contain an SF morpheme followed by more
+    morphemes, are followed by a line starting with '</' and fail _is_correct_eos()
+    """
+    file_name = os.path.basename(sys.stdin.name)
+    for line_num, (curr_line, next_line) in enumerate(_get_two_lines(sys.stdin), start=1):
+        cols = curr_line.split('\t')
+        is_word_line = len(cols) == 3 and WORD_ID_PTN.match(cols[0])
+        if not is_word_line:
+            continue
+        is_candidate = '/SF + ' in cols[2] and next_line.startswith('</')
+        if is_candidate and not _is_correct_eos(curr_line):
+            # reported as "<file name>:<line number><TAB><line>"
+            print('{}:{}\t{}'.format(file_name, line_num, curr_line))
+
+
+########
+# main #
+########
+def main():
+    """
+    main function processes only argument parsing
+
+    When --input / --output are given, sys.stdin / sys.stdout are replaced with the
+    opened files, the logging level is set according to --debug, and run() is called.
+    """
+    parser = ArgumentParser(description='detect period error of Sejong corpus')
+    parser.add_argument('--input', help='input file <default: stdin>', metavar='FILE')
+    parser.add_argument('--output', help='output file <default: stdout>', metavar='FILE')
+    parser.add_argument('--debug', help='enable debug', action='store_true')
+    args = parser.parse_args()
+
+    # pin the encoding explicitly: without it open() falls back to the locale's
+    # preferred encoding, which may fail to decode the corpus text
+    if args.input:
+        sys.stdin = open(args.input, 'rt', encoding='UTF-8')
+    if args.output:
+        sys.stdout = open(args.output, 'wt', encoding='UTF-8')
+    if args.debug:
+        logging.basicConfig(level=logging.DEBUG)
+    else:
+        logging.basicConfig(level=logging.INFO)
+
+    run()
+
+
+if __name__ == '__main__':
+    main()
diff --git a/munjong/bin/fix_final_symbol_error.py b/munjong/bin/fix_final_symbol_error.py
@@ -0,0 +1,86 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+
+"""
+fix final symbol errors on Sejong corpus
+__author__ = 'Jamie (jamie.lim@kakaocorp.com)'
+__copyright__ = 'Copyright (C) 2019-, Kakao Corp. All rights reserved.'
+"""
+
+
+###########
+# imports #
+###########
+from argparse import ArgumentParser
+import logging
+import os
+import sys
+
+from khaiii.munjong.sejong_corpus import Morph, Word, WORD_ID_PTN
+
+
+#############
+# functions #
+#############
+def _attach_missing_symbol(word: Word):
+    """
+    append a morpheme for the final symbol of the raw word when the morphemes
+    cover the raw word except for exactly one trailing character
+    Args:
+        word:  Word object (its morphs list is modified in place)
+    """
+    morph_text = ''.join(morph.lex for morph in word.morphs)
+    raw_text = word.raw
+    has_one_extra_char = (raw_text.startswith(morph_text)
+                          and len(raw_text) == len(morph_text) + 1)
+    if not has_one_extra_char:
+        return
+    tail = raw_text[-1]
+    if tail == '.':
+        # a period is attached only when the last morpheme is tagged EC
+        tag = 'SF' if word.morphs[-1].tag == 'EC' else None
+    else:
+        tag = {',': 'SP', '"': 'SS'}.get(tail)
+    if tag is not None:
+        word.morphs.append(Morph(tail, tag))
+
+
+def run():
+    """
+    read lines from stdin, attach missing final symbols to word lines
+    and print every line (other lines are printed unchanged) to stdout
+    """
+    file_name = os.path.basename(sys.stdin.name)
+    for line_num, raw_line in enumerate(sys.stdin, start=1):
+        text = raw_line.rstrip('\r\n')
+        if WORD_ID_PTN.match(text):
+            word = Word.parse(text, file_name, line_num)
+            _attach_missing_symbol(word)
+            print(word)
+        else:
+            print(text)
+
+
+########
+# main #
+########
+def main():
+    """
+    main function processes only argument parsing
+
+    When --input / --output are given, sys.stdin / sys.stdout are replaced with the
+    opened files, the logging level is set according to --debug, and run() is called.
+    """
+    parser = ArgumentParser(description='fix final symbol errors on Sejong corpus')
+    parser.add_argument('--input', help='input file <default: stdin>', metavar='FILE')
+    parser.add_argument('--output', help='output file <default: stdout>', metavar='FILE')
+    parser.add_argument('--debug', help='enable debug', action='store_true')
+    args = parser.parse_args()
+
+    # pin the encoding explicitly: without it open() falls back to the locale's
+    # preferred encoding, which may fail to decode or encode the corpus text
+    if args.input:
+        sys.stdin = open(args.input, 'rt', encoding='UTF-8')
+    if args.output:
+        sys.stdout = open(args.output, 'wt', encoding='UTF-8')
+    if args.debug:
+        logging.basicConfig(level=logging.DEBUG)
+    else:
+        logging.basicConfig(level=logging.INFO)
+
+    run()
+
+
+if __name__ == '__main__':
+    main()
diff --git a/munjong/bin/recover_english_case.py b/munjong/bin/recover_english_case.py
@@ -0,0 +1,96 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+
+"""
+recover cases of English letters in Sejong corpus
+__author__ = 'Jamie (jamie.lim@kakaocorp.com)'
+__copyright__ = 'Copyright (C) 2019-, Kakao Corp. All rights reserved.'
+"""
+
+
+###########
+# imports #
+###########
+from argparse import ArgumentParser
+import copy
+import logging
+import os
+import re
+import sys
+
+from khaiii.munjong.sejong_corpus import Word, WORD_ID_PTN
+
+
+#############
+# functions #
+#############
+def _recover(word: Word):
+    """
+    recover the case of English letters in morphemes from the raw word
+    Args:
+        word:  Word object (its morphs are replaced only when something was recovered)
+    Raises:
+        IndexError:  when the morphemes contain more English letters than the raw word
+    """
+    is_letter = re.compile(r'[a-zA-Z]').match
+    raw_letters = [char for char in word.raw if is_letter(char)]
+    # work on a copy so that the word stays untouched when an error is raised midway
+    candidate = copy.deepcopy(word)
+    cursor = 0
+    changed = False
+    for morph in candidate.morphs:
+        for pos, char in enumerate(morph.lex):
+            if not is_letter(char):
+                continue
+            raw_letter = raw_letters[cursor]
+            cursor += 1
+            if raw_letter != char:
+                # take the n-th letter of the raw word for the n-th letter of the morphemes
+                morph.lex = morph.lex[:pos] + raw_letter + morph.lex[pos+1:]
+                changed = True
+    if changed:
+        logging.info('%s  =>  %s', str(word), candidate.morph_str())
+        word.morphs = candidate.morphs
+
+
+def run():
+    """
+    read lines from stdin, recover the case of English letters in word lines
+    and print every line (other lines are printed unchanged) to stdout
+    """
+    file_name = os.path.basename(sys.stdin.name)
+    for line_num, raw_line in enumerate(sys.stdin, start=1):
+        text = raw_line.rstrip('\r\n')
+        if WORD_ID_PTN.match(text):
+            word = Word.parse(text, file_name, line_num)
+            try:
+                _recover(word)
+            except IndexError as idx_err:
+                # the word is still printed below after the error is logged
+                logging.error('%s(%d): %s: %s', file_name, line_num, idx_err, word)
+            print(word)
+        else:
+            print(text)
+
+
+########
+# main #
+########
+def main():
+    """
+    parse command line arguments, redirect stdin/stdout to the given files,
+    configure the logging level and then call run()
+    """
+    parser = ArgumentParser(description='recover cases of English letters in Sejong corpus')
+    for option, usage in (('--input', 'input file <default: stdin>'),
+                          ('--output', 'output file <default: stdout>')):
+        parser.add_argument(option, help=usage, metavar='FILE')
+    parser.add_argument('--debug', help='enable debug', action='store_true')
+    args = parser.parse_args()
+
+    # run() works on sys.stdin / sys.stdout, so file arguments replace the streams
+    if args.input:
+        sys.stdin = open(args.input, 'r', encoding='UTF-8')
+    if args.output:
+        sys.stdout = open(args.output, 'w', encoding='UTF-8')
+    logging.basicConfig(level=logging.DEBUG if args.debug else logging.INFO)
+
+    run()
+
+
+if __name__ == '__main__':
+    main()