From 767d05734b845d89ed51e9d605d20217116842e4 Mon Sep 17 00:00:00 2001 From: Jamie Date: Mon, 28 Jan 2019 20:21:51 +0900 Subject: [PATCH] =?UTF-8?q?=EC=84=B8=EC=A2=85=20=EC=BD=94=ED=8D=BC?= =?UTF-8?q?=EC=8A=A4=EC=9D=98=20=EC=98=A4=EB=A5=98=EC=99=80=20=EA=B4=80?= =?UTF-8?q?=EB=A0=A8=EB=90=9C=20=EA=B0=81=EC=A2=85=20=EC=8A=A4=ED=81=AC?= =?UTF-8?q?=EB=A6=BD=ED=8A=B8=20=EC=B6=94=EA=B0=80=20#3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- munjong/bin/convert_jamo_to_compat.py | 79 ++++++++++++++ munjong/bin/detect_sejong_period_error.py | 106 +++++++++++++++++++ munjong/bin/fix_final_symbol_error.py | 86 ++++++++++++++++ munjong/bin/recover_english_case.py | 96 +++++++++++++++++ munjong/bin/recover_raw_morph_mismatch.py | 101 ++++++++++++++++++ munjong/bin/recover_wide_quotation.py | 108 +++++++++++++++++++ munjong/bin/remove_sejong_period_error.py | 120 ++++++++++++++++++++++ src/main/python/khaiii/resource/jaso.py | 103 +++++++++++++++---- 8 files changed, 781 insertions(+), 18 deletions(-) create mode 100755 munjong/bin/convert_jamo_to_compat.py create mode 100755 munjong/bin/detect_sejong_period_error.py create mode 100755 munjong/bin/fix_final_symbol_error.py create mode 100755 munjong/bin/recover_english_case.py create mode 100755 munjong/bin/recover_raw_morph_mismatch.py create mode 100755 munjong/bin/recover_wide_quotation.py create mode 100755 munjong/bin/remove_sejong_period_error.py diff --git a/munjong/bin/convert_jamo_to_compat.py b/munjong/bin/convert_jamo_to_compat.py new file mode 100755 index 0000000..6ea1426 --- /dev/null +++ b/munjong/bin/convert_jamo_to_compat.py @@ -0,0 +1,79 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + + +""" +한글 자모 영역의 코드를 호환 영역으로 변환 +__author__ = 'Jamie (jamie.lim@kakaocorp.com)' +__copyright__ = 'Copyright (C) 2019-, Kakao Corp. All rights reserved.' +""" + + +########### +# imports # +########### +from argparse import ArgumentParser +import logging +import sys + +from khaiii.munjong.sejong_corpus import WORD_ID_PTN +from khaiii.resource.jaso import norm_compat + + +############# +# functions # +############# +def _norm(text: str) -> str: + """ + 정규화를 수행하는 함수 + Args: + text: 입력 텍스트 + Returns: + 정규화된 텍스트 + """ + normalized = norm_compat(text) + normalized = normalized.replace('ᆞ', 'ㆍ') # 0x119e -> 0x318d + normalized = normalized.replace('ᄝ', 'ㅱ') # 0x111d -> 0x3171 + return normalized + + +def run(): + """ + run function which is the start point of program + """ + for line in sys.stdin: + line = line.rstrip('\r\n') + if not WORD_ID_PTN.match(line): + print(line) + continue + wid, word, morph = line.split('\t') + print('{}\t{}\t{}'.format(wid, _norm(word), _norm(morph))) + + +######## +# main # +######## +def main(): + """ + main function processes only argument parsing + """ + parser = ArgumentParser(description='한글 자모 영역의 코드를 호환 영역으로 변환') + parser.add_argument('--input', help='input file ', metavar='FILE') + parser.add_argument('--output', help='output file ', metavar='FILE') + parser.add_argument('--debug', help='enable debug', action='store_true') + args = parser.parse_args() + + if args.input: + sys.stdin = open(args.input, 'r', encoding='UTF-8') + if args.output: + sys.stdout = open(args.output, 'w', encoding='UTF-8') + if args.debug: + logging.basicConfig(level=logging.DEBUG) + else: + logging.basicConfig(level=logging.INFO) + + run() + + +if __name__ == '__main__': + main() diff --git a/munjong/bin/detect_sejong_period_error.py b/munjong/bin/detect_sejong_period_error.py new file mode 100755 index 0000000..7dfd4fb --- /dev/null +++ b/munjong/bin/detect_sejong_period_error.py @@ -0,0 +1,106 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + + +""" +detect period error of Sejong corpus +__author__ = 'Jamie (jamie.lim@kakaocorp.com)' +__copyright__ = 'Copyright (C) 2019-, Kakao Corp. All rights reserved.' +""" + + +########### +# imports # +########### +from argparse import ArgumentParser +import logging +import os +import re +import sys +from typing import Iterator, TextIO, Tuple + +from khaiii.munjong.sejong_corpus import Morph, WORD_ID_PTN + + +############# +# functions # +############# +def _get_two_lines(fin: TextIO) -> Iterator[Tuple[str, str]]: + """ + get two lines tuple from file (generator) + Args: + fin: input file + Yields: + current line + next line + """ + curr_line = fin.readline().rstrip('\r\n') + for next_line in fin: + next_line = next_line.rstrip('\r\n') + yield curr_line, next_line + curr_line = next_line + + +def _is_correct_eos(line: str) -> bool: + """ + whether correct end of sentence or not + Args: + line: line (word) + Returns: + whether correct or not + """ + _, _, morphs_str = line.split('\t') + if re.match(r'.+/EF \+ ./SF$', morphs_str): + return True + if re.match(r'.+/SF \+ [\'"’”」\]]/SS$', morphs_str): + return True + morphs = [Morph.parse(_) for _ in morphs_str.split(' + ')] + tags_str = '+'.join([_.tag for _ in morphs]) + if tags_str.endswith('+SF+SS+JKQ') or tags_str.endswith('+SF+SS+VCP+ETM'): + return True + return False + + +def run(): + """ + run function which is the start point of program + """ + file_name = os.path.basename(sys.stdin.name) + for line_num, (curr_line, next_line) in enumerate(_get_two_lines(sys.stdin), start=1): + cols = curr_line.split('\t') + if len(cols) != 3 or not WORD_ID_PTN.match(cols[0]): + continue + if '/SF + ' not in cols[2] or not next_line.startswith('', metavar='FILE') + parser.add_argument('--output', help='output file ', metavar='FILE') + parser.add_argument('--debug', help='enable debug', action='store_true') + args = parser.parse_args() + + if args.input: + sys.stdin = open(args.input, 'rt') + if args.output: + sys.stdout = open(args.output, 'wt') + if args.debug: + logging.basicConfig(level=logging.DEBUG) + else: + logging.basicConfig(level=logging.INFO) + + run() + + +if __name__ == '__main__': + main() diff --git a/munjong/bin/fix_final_symbol_error.py b/munjong/bin/fix_final_symbol_error.py new file mode 100755 index 0000000..5ba1644 --- /dev/null +++ b/munjong/bin/fix_final_symbol_error.py @@ -0,0 +1,86 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + + +""" +fix final symbol errors on Sejong corpus +__author__ = 'Jamie (jamie.lim@kakaocorp.com)' +__copyright__ = 'Copyright (C) 2019-, Kakao Corp. All rights reserved.' +""" + + +########### +# imports # +########### +from argparse import ArgumentParser +import logging +import os +import sys + +from khaiii.munjong.sejong_corpus import Morph, Word, WORD_ID_PTN + + +############# +# functions # +############# +def _attach_missing_symbol(word: Word): + """ + attach missing symbol + Args: + word: Word object + """ + raw_word = word.raw + raw_morph = ''.join([_.lex for _ in word.morphs]) + if not raw_word.startswith(raw_morph) or len(raw_word) != len(raw_morph)+1: + return + last_symbol = raw_word[-1] + if last_symbol == '.' and word.morphs[-1].tag == 'EC': + word.morphs.append(Morph('.', 'SF')) + elif last_symbol == ',': + word.morphs.append(Morph(',', 'SP')) + elif last_symbol == '"': + word.morphs.append(Morph('"', 'SS')) + + +def run(): + """ + run function which is the start point of program + """ + file_name = os.path.basename(sys.stdin.name) + for line_num, line in enumerate(sys.stdin, start=1): + line = line.rstrip('\r\n') + if not WORD_ID_PTN.match(line): + print(line) + continue + word = Word.parse(line, file_name, line_num) + _attach_missing_symbol(word) + print(word) + + +######## +# main # +######## +def main(): + """ + main function processes only argument parsing + """ + parser = ArgumentParser(description='fix final symbol errors on Sejong corpus') + parser.add_argument('--input', help='input file ', metavar='FILE') + parser.add_argument('--output', help='output file ', metavar='FILE') + parser.add_argument('--debug', help='enable debug', action='store_true') + args = parser.parse_args() + + if args.input: + sys.stdin = open(args.input, 'rt') + if args.output: + sys.stdout = open(args.output, 'wt') + if args.debug: + logging.basicConfig(level=logging.DEBUG) + else: + logging.basicConfig(level=logging.INFO) + + run() + + +if __name__ == '__main__': + main() diff --git a/munjong/bin/recover_english_case.py b/munjong/bin/recover_english_case.py new file mode 100755 index 0000000..5a0dd9d --- /dev/null +++ b/munjong/bin/recover_english_case.py @@ -0,0 +1,96 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + + +""" +recover cases of English letters in Sejong corpus +__author__ = 'Jamie (jamie.lim@kakaocorp.com)' +__copyright__ = 'Copyright (C) 2019-, Kakao Corp. All rights reserved.' +""" + + +########### +# imports # +########### +from argparse import ArgumentParser +import copy +import logging +import os +import re +import sys + +from khaiii.munjong.sejong_corpus import Word, WORD_ID_PTN + + +############# +# functions # +############# +def _recover(word: Word): + """ + recover cases + Args: + word: Word object + """ + word_letters = [_ for _ in word.raw if re.match(r'[a-zA-Z]', _)] + letter_idx = -1 + is_recovered = False + word_copy = copy.deepcopy(word) + for morph in word_copy.morphs: + for idx, char in enumerate(morph.lex): + if not re.match(r'[a-zA-Z]', char): + continue + letter_idx += 1 + if word_letters[letter_idx] == char: + continue + morph.lex = morph.lex[:idx] + word_letters[letter_idx] + morph.lex[idx+1:] + is_recovered = True + if is_recovered: + logging.info('%s => %s', str(word), word_copy.morph_str()) + word.morphs = word_copy.morphs + + +def run(): + """ + run function which is the start point of program + """ + file_name = os.path.basename(sys.stdin.name) + for line_num, line in enumerate(sys.stdin, start=1): + line = line.rstrip('\r\n') + if not WORD_ID_PTN.match(line): + print(line) + continue + word = Word.parse(line, file_name, line_num) + try: + _recover(word) + except IndexError as idx_err: + logging.error('%s(%d): %s: %s', file_name, line_num, idx_err, word) + print(word) + + +######## +# main # +######## +def main(): + """ + main function processes only argument parsing + """ + parser = ArgumentParser(description='recover cases of English letters in Sejong corpus') + parser.add_argument('--input', help='input file ', metavar='FILE') + parser.add_argument('--output', help='output file ', metavar='FILE') + parser.add_argument('--debug', help='enable debug', action='store_true') + args = parser.parse_args() + + if args.input: + sys.stdin = open(args.input, 'r', encoding='UTF-8') + if args.output: + sys.stdout = open(args.output, 'w', encoding='UTF-8') + if args.debug: + logging.basicConfig(level=logging.DEBUG) + else: + logging.basicConfig(level=logging.INFO) + + run() + + +if __name__ == '__main__': + main() diff --git a/munjong/bin/recover_raw_morph_mismatch.py b/munjong/bin/recover_raw_morph_mismatch.py new file mode 100755 index 0000000..2bcb136 --- /dev/null +++ b/munjong/bin/recover_raw_morph_mismatch.py @@ -0,0 +1,101 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + + +""" +어절의 원문과 형태소 분석 결과의 문자가 정규화하면 같지만 코드가 다른 경우 원문의 문자로 복원 +__author__ = 'Jamie (jamie.lim@kakaocorp.com)' +__copyright__ = 'Copyright (C) 2019-, Kakao Corp. All rights reserved.' +""" + + +########### +# imports # +########### +from argparse import ArgumentParser +import logging +import os +import sys + +from khaiii.munjong.sejong_corpus import Morph, ParseError, Word, WORD_ID_PTN + + +############# +# functions # +############# +def _recover(line: str) -> str: + """ + 문자를 복원한다. + Args: + line: 어절 라인 + Returns: + 복원된 라인 + """ + wid, raw, morphs_str = line.split('\t') + raw_idx = 0 + morphs = [] + for token_str in morphs_str.split(' + '): + morph = Morph.parse(token_str) + lex = [] + for _ in range(len(morph.lex)): + try: + lex.append(raw[raw_idx]) + raw_idx += 1 + except IndexError as idx_err: + logging.error(line) + raise idx_err + morph.lex = ''.join(lex) + morphs.append(morph) + morphs_new = ' + '.join([str(m) for m in morphs]) + logging.debug('%s\t%s\t%s => %s', wid, raw, morphs_str, morphs_new) + return '{}\t{}\t{}'.format(wid, raw, morphs_new) + + +def run(): + """ + run function which is the start point of program + """ + file_name = os.path.basename(sys.stdin.name) + for line_num, line in enumerate(sys.stdin, start=1): + line = line.rstrip('\r\n') + if not WORD_ID_PTN.match(line): + print(line) + continue + try: + Word.parse(line, file_name, line_num) + except ParseError as par_err: + if 'raw-morph mismatch' in str(par_err): + line = _recover(line) + else: + raise par_err + print(line) + + +######## +# main # +######## +def main(): + """ + main function processes only argument parsing + """ + parser = ArgumentParser(description='어절의 원문과 형태소 분석 결과의 문자가 정규화하면 같지만 코드가 다른 경우' + ' 원문의 문자로 복원') + parser.add_argument('--input', help='input file ', metavar='FILE') + parser.add_argument('--output', help='output file ', metavar='FILE') + parser.add_argument('--debug', help='enable debug', action='store_true') + args = parser.parse_args() + + if args.input: + sys.stdin = open(args.input, 'r', encoding='UTF-8') + if args.output: + sys.stdout = open(args.output, 'w', encoding='UTF-8') + if args.debug: + logging.basicConfig(level=logging.DEBUG) + else: + logging.basicConfig(level=logging.INFO) + + run() + + +if __name__ == '__main__': + main() diff --git a/munjong/bin/recover_wide_quotation.py b/munjong/bin/recover_wide_quotation.py new file mode 100755 index 0000000..0ee0792 --- /dev/null +++ b/munjong/bin/recover_wide_quotation.py @@ -0,0 +1,108 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + + +""" +recover wide char quotations in Sejong corpus +__author__ = 'Jamie (jamie.lim@kakaocorp.com)' +__copyright__ = 'Copyright (C) 2019-, Kakao Corp. All rights reserved.' +""" + + +########### +# imports # +########### +from argparse import ArgumentParser +import logging +import os +import sys + +from khaiii.munjong.sejong_corpus import Word, WORD_ID_PTN + + +############# +# constants # +############# +_QUOT_NORM = { + '"': '"', + '“': '"', + '”': '"', + "'": "'", + "‘": "'", + "’": "'", + "`": "'", +} + + +############# +# functions # +############# +def _recover(word: Word): + """ + recover wide char quotations + Args: + word: Word object + """ + word_quots = [_ for _ in word.raw if _ in _QUOT_NORM] + morph_quots = [] + for idx, morph in enumerate(word.morphs): + if morph.tag != 'SS' or morph.lex not in _QUOT_NORM: + continue + morph_quots.append((idx, morph)) + quot_idx = len(morph_quots)-1 + if len(word_quots) <= quot_idx or _QUOT_NORM[word_quots[quot_idx]] != _QUOT_NORM[morph.lex]: + logging.error('%d-th quots are different: %s', quot_idx+1, word) + return + if len(word_quots) != len(morph_quots): + morph_quots = [_ for _ in word.morph_str() if _ in _QUOT_NORM] + if word_quots != morph_quots: + logging.error('number of quots are different: %s', word) + return + for word_char, (idx, morph) in zip(word_quots, morph_quots): + if word_char == morph.lex: + continue + morph.lex = word_char + + +def run(): + """ + run function which is the start point of program + """ + file_name = os.path.basename(sys.stdin.name) + for line_num, line in enumerate(sys.stdin, start=1): + line = line.rstrip('\r\n') + if not WORD_ID_PTN.match(line): + print(line) + continue + word = Word.parse(line, file_name, line_num) + _recover(word) + print(word) + + +######## +# main # +######## +def main(): + """ + main function processes only argument parsing + """ + parser = ArgumentParser(description='recover wide char quotations in Sejong corpus') + parser.add_argument('--input', help='input file ', metavar='FILE') + parser.add_argument('--output', help='output file ', metavar='FILE') + parser.add_argument('--debug', help='enable debug', action='store_true') + args = parser.parse_args() + + if args.input: + sys.stdin = open(args.input, 'rt') + if args.output: + sys.stdout = open(args.output, 'wt') + if args.debug: + logging.basicConfig(level=logging.DEBUG) + else: + logging.basicConfig(level=logging.INFO) + + run() + + +if __name__ == '__main__': + main() diff --git a/munjong/bin/remove_sejong_period_error.py b/munjong/bin/remove_sejong_period_error.py new file mode 100755 index 0000000..59ffd86 --- /dev/null +++ b/munjong/bin/remove_sejong_period_error.py @@ -0,0 +1,120 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + + +""" +remove wrong sentence breaking marks after period error eojeol +__author__ = 'Jamie (jamie.lim@kakaocorp.com)' +__copyright__ = 'Copyright (C) 2017-, Kakao Corp. All rights reserved.' +""" + + +########### +# imports # +########### +from argparse import ArgumentParser +import logging +import os +import re +import sys +from typing import TextIO, Tuple + +from khaiii.munjong.sejong_corpus import Morph, WORD_ID_PTN + + +############# +# functions # +############# +def _get_three_lines(fin: TextIO) -> Tuple[str, str, str]: + """ + get three lines tuple from file (generator) + Args: + fin: input file + Yields: + prev. prev. line + prev. line + curr. line + """ + prev_prev_line = fin.readline().rstrip('\r\n') + prev_line = fin.readline().rstrip('\r\n') + # print first two lines + print(prev_prev_line) + print(prev_line) + for curr_line in fin: + curr_line = curr_line.rstrip('\r\n') + yield prev_prev_line, prev_line, curr_line + prev_prev_line = prev_line + prev_line = curr_line + + +def _is_known_period_error_eojeol(line: str) -> bool: + """ + 알려진 특정 문장분리 오류를 포함하는 어절인 지 여부 + Args: + line: line (eojeol) + Returns: + whether has error or not + """ + cols = line.split('\t') + if len(cols) != 3 or not WORD_ID_PTN.match(cols[0]): + return False + if '/SF + ' not in cols[2] or re.match(r'.+/EF \+ ./SF$', cols[2]): + return False + if re.match(r'.+/SF \+ [\'"’”]/SS$', cols[2]): + return False + morphs = [Morph.parse(_) for _ in cols[2].split(' + ')] + tags_str = '+'.join([_.tag for _ in morphs]) + if 'SN+SF+SN' in tags_str and not tags_str.endswith('+SF'): + # 4.6판: 4/SN + ./SF + 6/SN + 판/NNB + if 'XSN+SF+SN' not in tags_str: + return True + elif 'SL+SF+SL' in tags_str and not tags_str.endswith('+SF'): + # S.M.오너: S/SL + ./SF + M/SL + ./SF + 오너/NNG + return True + return False + + +def run(): + """ + run function which is the start point of program + """ + file_name = os.path.basename(sys.stdin.name) + for line_num, (prev_prev_line, prev_line, curr_line) in enumerate(_get_three_lines(sys.stdin), + start=1): + if curr_line == '

' and _is_known_period_error_eojeol(prev_line): + continue + elif prev_line == '

' and curr_line == '

' and \ + _is_known_period_error_eojeol(prev_prev_line): + logging.info('%s:%d\t%s', file_name, line_num, prev_prev_line) + continue + print(curr_line) + + +######## +# main # +######## +def main(): + """ + main function processes only argument parsing + """ + parser = ArgumentParser(description='remove wrong sentence breaking marks after' + ' period error eojeol') + parser.add_argument('--input', help='input file ', metavar='FILE') + parser.add_argument('--output', help='output file ', metavar='FILE') + parser.add_argument('--debug', help='enable debug', action='store_true') + args = parser.parse_args() + + if args.input: + sys.stdin = open(args.input, 'rt') + if args.output: + sys.stdout = open(args.output, 'wt') + if args.debug: + logging.basicConfig(level=logging.DEBUG) + else: + logging.basicConfig(level=logging.INFO) + + run() + + +if __name__ == '__main__': + main() diff --git a/src/main/python/khaiii/resource/jaso.py b/src/main/python/khaiii/resource/jaso.py index 1f5e184..7b667b8 100644 --- a/src/main/python/khaiii/resource/jaso.py +++ b/src/main/python/khaiii/resource/jaso.py @@ -17,21 +17,63 @@ ############# # constants # ############# -_FIRST = ['\u3131', '\u3132', '\u3134', '\u3137', '\u3138', # 초성 - '\u3139', '\u3141', '\u3142', '\u3143', '\u3145', - '\u3146', '\u3147', '\u3148', '\u3149', '\u314a', - '\u314b', '\u314c', '\u314d', '\u314e'] -_MIDDLE = ['\u314f', '\u3150', '\u3151', '\u3152', '\u3153', # 중성 - '\u3154', '\u3155', '\u3156', '\u3157', '\u3158', - '\u3159', '\u315a', '\u315b', '\u315c', '\u315d', - '\u315e', '\u315f', '\u3160', '\u3161', '\u3162', - '\u3163'] -_LAST = ['\u3131', '\u3132', '\u3133', '\u3134', '\u3135', # 종성 - '\u3136', '\u3137', '\u3139', '\u313a', '\u313b', - '\u313c', '\u313d', '\u313e', '\u313f', '\u3140', - '\u3141', '\u3142', '\u3144', '\u3145', '\u3146', - '\u3147', '\u3148', '\u314a', '\u314b', '\u314c', - '\u314d', '\u314e'] +# 한글 자모 호환 영역 (초성과 종성이 같음. 두벌식 키보드로 입력할 때 들어가는 코드) +_FIRST_COMPAT = ['\u3131', '\u3132', '\u3134', '\u3137', '\u3138', # 초성 + '\u3139', '\u3141', '\u3142', '\u3143', '\u3145', + '\u3146', '\u3147', '\u3148', '\u3149', '\u314a', + '\u314b', '\u314c', '\u314d', '\u314e'] +_MIDDLE_COMPAT = ['\u314f', '\u3150', '\u3151', '\u3152', '\u3153', # 중성 + '\u3154', '\u3155', '\u3156', '\u3157', '\u3158', + '\u3159', '\u315a', '\u315b', '\u315c', '\u315d', + '\u315e', '\u315f', '\u3160', '\u3161', '\u3162', + '\u3163'] +_LAST_COMPAT = ['\u3131', '\u3132', '\u3133', '\u3134', '\u3135', # 종성 + '\u3136', '\u3137', '\u3139', '\u313a', '\u313b', + '\u313c', '\u313d', '\u313e', '\u313f', '\u3140', + '\u3141', '\u3142', '\u3144', '\u3145', '\u3146', + '\u3147', '\u3148', '\u314a', '\u314b', '\u314c', + '\u314d', '\u314e'] +_ALL_COMPAT = _FIRST_COMPAT + _MIDDLE_COMPAT + _LAST_COMPAT + +# 한글 자모 영역 (초성과 종성이 다름. 세종 코퍼스에서 사용한 코드) +_FIRST_JAMO = ['\u1100', '\u1101', '\u1102', '\u1103', '\u1104', # 초성 + '\u1105', '\u1106', '\u1107', '\u1108', '\u1109', + '\u110a', '\u110b', '\u110c', '\u110d', '\u110e', + '\u110f', '\u1110', '\u1111', '\u1112'] +_MIDDLE_JAMO = ['\u1161', '\u1162', '\u1163', '\u1164', '\u1165', # 중성 + '\u1166', '\u1167', '\u1168', '\u1169', '\u116a', + '\u116b', '\u116c', '\u116d', '\u116e', '\u116f', + '\u1170', '\u1171', '\u1172', '\u1173', '\u1174', + '\u1175'] +_LAST_JAMO = ['\u11a8', '\u11a9', '\u11aa', '\u11ab', '\u11ac', # 종성 + '\u11ad', '\u11ae', '\u11af', '\u11b0', '\u11b1', + '\u11b2', '\u11b3', '\u11b4', '\u11b5', '\u11b6', + '\u11b7', '\u11b8', '\u11b9', '\u11ba', '\u11bb', + '\u11bc', '\u11bd', '\u11be', '\u11bf', '\u11c0', + '\u11c1', '\u11c2'] +_ALL_JAMO = _FIRST_JAMO + _MIDDLE_JAMO + _LAST_JAMO +_ALL_JAMO_SET = set(_ALL_JAMO) +_JAMO_TO_COMPAT = dict(zip(_ALL_JAMO, _ALL_COMPAT)) + +# 반각 자모 영역 (호환 영역과 비슷하게 초성과 종성이 같으나 글자 폭이 절반인 코드) +_FIRST_HALFWIDTH = ['\uffa1', '\uffa2', '\uffa4', '\uffa7', '\uffa8', # 초성 + '\uffa9', '\uffb1', '\uffb2', '\uffb3', '\uffb5', + '\uffb6', '\uffb7', '\uffb8', '\uffb9', '\uffba', + '\uffbb', '\uffbc', '\uffbd', '\uffbe'] +_MIDDLE_HALFWIDTH = ['\uffc2', '\uffc3', '\uffc4', '\uffc5', '\uffc6', # 중성 + '\uffc7', '\uffca', '\uffcb', '\uffcc', '\uffcd', + '\uffce', '\uffcf', '\uffd2', '\uffd3', '\uffd4', + '\uffd5', '\uffd6', '\uffd7', '\uffda', '\uffdb', + '\uffdc'] +_LAST_HALFWIDTH = ['\uffa1', '\uffa2', '\uffa3', '\uffa4', '\uffa5', # 종성 + '\uffa6', '\uffa7', '\uffa9', '\uffaa', '\uffab', + '\uffac', '\uffad', '\uffae', '\uffaf', '\uffb0', + '\uffb1', '\uffb2', '\uffb4', '\uffb5', '\uffb6', + '\uffb7', '\uffb8', '\uffba', '\uffbb', '\uffbc', + '\uffbd', '\uffbe'] +_ALL_HALFWIDTH = _FIRST_HALFWIDTH + _MIDDLE_HALFWIDTH + _LAST_HALFWIDTH +_ALL_HALFWIDTH_SET = set(_ALL_HALFWIDTH) +_HALFWIDTH_TO_COMPAT = dict(zip(_ALL_HALFWIDTH, _ALL_COMPAT)) ############# @@ -54,12 +96,12 @@ def _decomp_char(char: str) -> Tuple[str, str, str]: first_idx = first_start // 21 middle_idx = first_start % 21 - first = _FIRST[first_idx] - middle = _MIDDLE[middle_idx] + first = _FIRST_COMPAT[first_idx] + middle = _MIDDLE_COMPAT[middle_idx] if not last_idx: return first, middle - last = _LAST[last_idx-1] + last = _LAST_COMPAT[last_idx-1] return first, middle, last @@ -71,6 +113,9 @@ def decompose(text: str) -> str: Returns: 자소 분해된 텍스트 """ + if not text: + return text + decomposed = [] for char in text: code = ord(char) @@ -79,3 +124,25 @@ def decompose(text: str) -> str: else: decomposed.extend(_decomp_char(char)) return ''.join(decomposed) + + +def norm_compat(text: str) -> str: + """ + 유니코드 내 한글 자소를 호환 영역으로 정규화한다. + Args: + text: 한글 텍스트 + Returns: + 자소가 호환 영역으로 정규화된 텍스트 + """ + if not text: + return text + + normalized = [] + for char in text: + if char in _ALL_JAMO_SET: + normalized.append(_JAMO_TO_COMPAT[char]) + elif char in _ALL_HALFWIDTH_SET: + normalized.append(_HALFWIDTH_TO_COMPAT[char]) + else: + normalized.append(char) + return ''.join(normalized)