From 767d05734b845d89ed51e9d605d20217116842e4 Mon Sep 17 00:00:00 2001
From: Jamie
Date: Mon, 28 Jan 2019 20:21:51 +0900
Subject: [PATCH] =?UTF-8?q?=EC=84=B8=EC=A2=85=20=EC=BD=94=ED=8D=BC?=
=?UTF-8?q?=EC=8A=A4=EC=9D=98=20=EC=98=A4=EB=A5=98=EC=99=80=20=EA=B4=80?=
=?UTF-8?q?=EB=A0=A8=EB=90=9C=20=EA=B0=81=EC=A2=85=20=EC=8A=A4=ED=81=AC?=
=?UTF-8?q?=EB=A6=BD=ED=8A=B8=20=EC=B6=94=EA=B0=80=20#3?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
munjong/bin/convert_jamo_to_compat.py | 79 ++++++++++++++
munjong/bin/detect_sejong_period_error.py | 106 +++++++++++++++++++
munjong/bin/fix_final_symbol_error.py | 86 ++++++++++++++++
munjong/bin/recover_english_case.py | 96 +++++++++++++++++
munjong/bin/recover_raw_morph_mismatch.py | 101 ++++++++++++++++++
munjong/bin/recover_wide_quotation.py | 108 +++++++++++++++++++
munjong/bin/remove_sejong_period_error.py | 120 ++++++++++++++++++++++
src/main/python/khaiii/resource/jaso.py | 103 +++++++++++++++----
8 files changed, 781 insertions(+), 18 deletions(-)
create mode 100755 munjong/bin/convert_jamo_to_compat.py
create mode 100755 munjong/bin/detect_sejong_period_error.py
create mode 100755 munjong/bin/fix_final_symbol_error.py
create mode 100755 munjong/bin/recover_english_case.py
create mode 100755 munjong/bin/recover_raw_morph_mismatch.py
create mode 100755 munjong/bin/recover_wide_quotation.py
create mode 100755 munjong/bin/remove_sejong_period_error.py
diff --git a/munjong/bin/convert_jamo_to_compat.py b/munjong/bin/convert_jamo_to_compat.py
new file mode 100755
index 0000000..6ea1426
--- /dev/null
+++ b/munjong/bin/convert_jamo_to_compat.py
@@ -0,0 +1,79 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+
+"""
+한글 자모 영역의 코드를 호환 영역으로 변환
+__author__ = 'Jamie (jamie.lim@kakaocorp.com)'
+__copyright__ = 'Copyright (C) 2019-, Kakao Corp. All rights reserved.'
+"""
+
+
+###########
+# imports #
+###########
+from argparse import ArgumentParser
+import logging
+import sys
+
+from khaiii.munjong.sejong_corpus import WORD_ID_PTN
+from khaiii.resource.jaso import norm_compat
+
+
+#############
+# functions #
+#############
+def _norm(text: str) -> str:
+    """
+    normalize Hangul jamo in the text into the compatibility jamo area
+    Args:
+        text:  input text
+    Returns:
+        normalized text
+    """
+    # two extra code points are replaced here, in addition to what norm_compat() converts
+    return (norm_compat(text)
+            .replace('ᆞ', 'ㆍ')    # 0x119e -> 0x318d
+            .replace('ᄝ', 'ㅱ'))   # 0x111d -> 0x3171
+
+
+def run():
+    """
+    read the corpus from standard input and write it to standard output,
+    normalizing the word and morpheme columns of every eojeol line
+    """
+    for raw_line in sys.stdin:
+        text = raw_line.rstrip('\r\n')
+        if WORD_ID_PTN.match(text):
+            # eojeol line: <word id> TAB <raw word> TAB <morphemes>
+            wid, word, morph = text.split('\t')
+            text = '\t'.join((wid, _norm(word), _norm(morph)))
+        # lines that are not eojeol lines are passed through unchanged
+        print(text)
+
+
+########
+# main #
+########
+def main():
+    """
+    parse command line arguments, redirect the standard streams if requested and call run()
+    """
+    arg_parser = ArgumentParser(description='한글 자모 영역의 코드를 호환 영역으로 변환')
+    arg_parser.add_argument('--input', help='input file ', metavar='FILE')
+    arg_parser.add_argument('--output', help='output file ', metavar='FILE')
+    arg_parser.add_argument('--debug', help='enable debug', action='store_true')
+    opts = arg_parser.parse_args()
+
+    # run() works on sys.stdin / sys.stdout, so the files replace the standard streams
+    if opts.input:
+        sys.stdin = open(opts.input, 'r', encoding='UTF-8')
+    if opts.output:
+        sys.stdout = open(opts.output, 'w', encoding='UTF-8')
+    logging.basicConfig(level=logging.DEBUG if opts.debug else logging.INFO)
+
+    run()
+
+
+if __name__ == '__main__':
+ main()
diff --git a/munjong/bin/detect_sejong_period_error.py b/munjong/bin/detect_sejong_period_error.py
new file mode 100755
index 0000000..7dfd4fb
--- /dev/null
+++ b/munjong/bin/detect_sejong_period_error.py
@@ -0,0 +1,106 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+
+"""
+detect period error of Sejong corpus
+__author__ = 'Jamie (jamie.lim@kakaocorp.com)'
+__copyright__ = 'Copyright (C) 2019-, Kakao Corp. All rights reserved.'
+"""
+
+
+###########
+# imports #
+###########
+from argparse import ArgumentParser
+import logging
+import os
+import re
+import sys
+from typing import Iterator, TextIO, Tuple
+
+from khaiii.munjong.sejong_corpus import Morph, WORD_ID_PTN
+
+
+#############
+# functions #
+#############
+def _get_two_lines(fin: TextIO) -> Iterator[Tuple[str, str]]:
+    """
+    iterate over pairs of adjacent lines of a file (generator)
+    Args:
+        fin:  input file
+    Yields:
+        current line
+        next line
+    """
+    stripped = (raw.rstrip('\r\n') for raw in fin)
+    # an empty file yields nothing; the last line never appears as the current line
+    previous = next(stripped, '')
+    for following in stripped:
+        yield previous, following
+        previous = following
+
+
+def _is_correct_eos(line: str) -> bool:
+    """
+    check whether the eojeol is an acceptable end of sentence
+    Args:
+        line:  eojeol line with three tab separated columns
+    Returns:
+        True if the morphemes end with one of the accepted patterns
+    """
+    _, _, morphs_str = line.split('\t')
+    # final ending followed by a single SF symbol, or SF followed by a closing quote/bracket
+    if (re.match(r'.+/EF \+ ./SF$', morphs_str) or
+            re.match(r'.+/SF \+ [\'"’”」\]]/SS$', morphs_str)):
+        return True
+    # otherwise decide from the tag sequence alone
+    tags_str = '+'.join(Morph.parse(token).tag for token in morphs_str.split(' + '))
+    return tags_str.endswith(('+SF+SS+JKQ', '+SF+SS+VCP+ETM'))
+
+
+def run():
+    """
+    read the corpus from standard input and report suspicious sentence-final eojeols
+
+    An eojeol is reported as "<file name>:<line number> TAB <line>" when it has an SF morpheme
+    followed by further morphemes, is directly followed by a closing tag line, and is not
+    accepted by _is_correct_eos().
+    """
+    file_name = os.path.basename(sys.stdin.name)
+    for line_num, (curr_line, next_line) in enumerate(_get_two_lines(sys.stdin), start=1):
+        cols = curr_line.split('\t')
+        if len(cols) != 3 or not WORD_ID_PTN.match(cols[0]):
+            continue
+        # NOTE(review): the tag literal arrived here as '' (markup stripped), which made this
+        # test always true. '</p>' is a reconstruction - confirm against the upstream script
+        # (it may have been a shorter prefix that also covers other closing tags).
+        if '/SF + ' not in cols[2] or not next_line.startswith('</p>'):
+            continue
+        if _is_correct_eos(curr_line):
+            continue
+        print('{}:{}\t{}'.format(file_name, line_num, curr_line))
+
+
+########
+# main #
+########
+def main():
+    """
+    parse command line arguments, redirect the standard streams if requested and call run()
+
+    Files given by --input / --output replace sys.stdin / sys.stdout. They are opened
+    explicitly as UTF-8 so that the result does not depend on the locale's default encoding.
+    """
+    parser = ArgumentParser(description='detect period error of Sejong corpus')
+    parser.add_argument('--input', help='input file ', metavar='FILE')
+    parser.add_argument('--output', help='output file ', metavar='FILE')
+    parser.add_argument('--debug', help='enable debug', action='store_true')
+    args = parser.parse_args()
+
+    if args.input:
+        sys.stdin = open(args.input, 'rt', encoding='UTF-8')
+    if args.output:
+        sys.stdout = open(args.output, 'wt', encoding='UTF-8')
+    if args.debug:
+        logging.basicConfig(level=logging.DEBUG)
+    else:
+        logging.basicConfig(level=logging.INFO)
+
+    run()
+
+
+if __name__ == '__main__':
+ main()
diff --git a/munjong/bin/fix_final_symbol_error.py b/munjong/bin/fix_final_symbol_error.py
new file mode 100755
index 0000000..5ba1644
--- /dev/null
+++ b/munjong/bin/fix_final_symbol_error.py
@@ -0,0 +1,86 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+
+"""
+fix final symbol errors on Sejong corpus
+__author__ = 'Jamie (jamie.lim@kakaocorp.com)'
+__copyright__ = 'Copyright (C) 2019-, Kakao Corp. All rights reserved.'
+"""
+
+
+###########
+# imports #
+###########
+from argparse import ArgumentParser
+import logging
+import os
+import sys
+
+from khaiii.munjong.sejong_corpus import Morph, Word, WORD_ID_PTN
+
+
+#############
+# functions #
+#############
+def _attach_missing_symbol(word: Word):
+    """
+    append a morpheme for a trailing symbol that is in the raw word but not in the morphemes
+    Args:
+        word:  Word object (modified in place)
+    """
+    surface = word.raw
+    analyzed = ''.join(morph.lex for morph in word.morphs)
+    # only handle the case where exactly one trailing character is missing
+    if len(surface) != len(analyzed) + 1 or not surface.startswith(analyzed):
+        return
+    tail = surface[-1]
+    if tail == '.':
+        # a period is attached only after a connective ending (EC)
+        if word.morphs[-1].tag == 'EC':
+            word.morphs.append(Morph('.', 'SF'))
+        return
+    tag = {',': 'SP', '"': 'SS'}.get(tail)
+    if tag is not None:
+        word.morphs.append(Morph(tail, tag))
+
+def run():
+    """
+    read the corpus from standard input, attach missing final symbols to eojeol lines
+    and write the result to standard output
+    """
+    source_name = os.path.basename(sys.stdin.name)
+    for lineno, raw_line in enumerate(sys.stdin, start=1):
+        text = raw_line.rstrip('\r\n')
+        if WORD_ID_PTN.match(text):
+            word = Word.parse(text, source_name, lineno)
+            _attach_missing_symbol(word)
+            print(word)
+        else:
+            # not an eojeol line: pass through unchanged
+            print(text)
+
+
+########
+# main #
+########
+def main():
+    """
+    parse command line arguments, redirect the standard streams if requested and call run()
+
+    Files given by --input / --output replace sys.stdin / sys.stdout. They are opened
+    explicitly as UTF-8 so that the result does not depend on the locale's default encoding.
+    """
+    parser = ArgumentParser(description='fix final symbol errors on Sejong corpus')
+    parser.add_argument('--input', help='input file ', metavar='FILE')
+    parser.add_argument('--output', help='output file ', metavar='FILE')
+    parser.add_argument('--debug', help='enable debug', action='store_true')
+    args = parser.parse_args()
+
+    if args.input:
+        sys.stdin = open(args.input, 'rt', encoding='UTF-8')
+    if args.output:
+        sys.stdout = open(args.output, 'wt', encoding='UTF-8')
+    if args.debug:
+        logging.basicConfig(level=logging.DEBUG)
+    else:
+        logging.basicConfig(level=logging.INFO)
+
+    run()
+
+
+if __name__ == '__main__':
+ main()
diff --git a/munjong/bin/recover_english_case.py b/munjong/bin/recover_english_case.py
new file mode 100755
index 0000000..5a0dd9d
--- /dev/null
+++ b/munjong/bin/recover_english_case.py
@@ -0,0 +1,96 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+
+"""
+recover cases of English letters in Sejong corpus
+__author__ = 'Jamie (jamie.lim@kakaocorp.com)'
+__copyright__ = 'Copyright (C) 2019-, Kakao Corp. All rights reserved.'
+"""
+
+
+###########
+# imports #
+###########
+from argparse import ArgumentParser
+import copy
+import logging
+import os
+import re
+import sys
+
+from khaiii.munjong.sejong_corpus import Word, WORD_ID_PTN
+
+
+#############
+# functions #
+#############
+def _recover(word: Word):
+    """
+    overwrite English letters in the morphemes with the letters of the raw word
+    Args:
+        word:  Word object (its morphs are replaced only when at least one letter differed)
+    Raises:
+        IndexError: when the morphemes contain more English letters than the raw word
+    """
+    letter_ptn = re.compile(r'[a-zA-Z]')
+    raw_letters = [char for char in word.raw if letter_ptn.match(char)]
+    # work on a copy so that the word stays untouched if an IndexError interrupts the loop
+    candidate = copy.deepcopy(word)
+    cursor = 0    # index of the next unread letter in raw_letters
+    changed = False
+    for morph in candidate.morphs:
+        chars = list(morph.lex)
+        for pos, char in enumerate(chars):
+            if not letter_ptn.match(char):
+                continue
+            raw_letter = raw_letters[cursor]
+            cursor += 1
+            if raw_letter != char:
+                chars[pos] = raw_letter
+                changed = True
+        rebuilt = ''.join(chars)
+        if rebuilt != morph.lex:
+            morph.lex = rebuilt
+    if not changed:
+        return
+    logging.info('%s => %s', str(word), candidate.morph_str())
+    word.morphs = candidate.morphs
+
+
+def run():
+    """
+    read the corpus from standard input, recover the letters of eojeol lines
+    and write the result to standard output
+    """
+    source_name = os.path.basename(sys.stdin.name)
+    for lineno, raw_line in enumerate(sys.stdin, start=1):
+        text = raw_line.rstrip('\r\n')
+        if WORD_ID_PTN.match(text):
+            word = Word.parse(text, source_name, lineno)
+            try:
+                _recover(word)
+            except IndexError as idx_err:
+                # the word is still printed (unchanged) after the error is logged
+                logging.error('%s(%d): %s: %s', source_name, lineno, idx_err, word)
+            print(word)
+        else:
+            # not an eojeol line: pass through unchanged
+            print(text)
+
+
+########
+# main #
+########
+def main():
+    """
+    parse command line arguments, redirect the standard streams if requested and call run()
+    """
+    arg_parser = ArgumentParser(description='recover cases of English letters in Sejong corpus')
+    arg_parser.add_argument('--input', help='input file ', metavar='FILE')
+    arg_parser.add_argument('--output', help='output file ', metavar='FILE')
+    arg_parser.add_argument('--debug', help='enable debug', action='store_true')
+    opts = arg_parser.parse_args()
+
+    # run() works on sys.stdin / sys.stdout, so the files replace the standard streams
+    if opts.input:
+        sys.stdin = open(opts.input, 'r', encoding='UTF-8')
+    if opts.output:
+        sys.stdout = open(opts.output, 'w', encoding='UTF-8')
+    logging.basicConfig(level=logging.DEBUG if opts.debug else logging.INFO)
+
+    run()
+
+
+if __name__ == '__main__':
+ main()
diff --git a/munjong/bin/recover_raw_morph_mismatch.py b/munjong/bin/recover_raw_morph_mismatch.py
new file mode 100755
index 0000000..2bcb136
--- /dev/null
+++ b/munjong/bin/recover_raw_morph_mismatch.py
@@ -0,0 +1,101 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+
+"""
+어절의 원문과 형태소 분석 결과의 문자가 정규화하면 같지만 코드가 다른 경우 원문의 문자로 복원
+__author__ = 'Jamie (jamie.lim@kakaocorp.com)'
+__copyright__ = 'Copyright (C) 2019-, Kakao Corp. All rights reserved.'
+"""
+
+
+###########
+# imports #
+###########
+from argparse import ArgumentParser
+import logging
+import os
+import sys
+
+from khaiii.munjong.sejong_corpus import Morph, ParseError, Word, WORD_ID_PTN
+
+
+#############
+# functions #
+#############
+def _recover(line: str) -> str:
+    """
+    rebuild the morpheme lexical forms from the characters of the raw word
+    Args:
+        line:  eojeol line with three tab separated columns
+    Returns:
+        recovered line
+    Raises:
+        IndexError: when the morphemes are longer in total than the raw word
+    """
+    wid, raw, morphs_str = line.split('\t')
+    cursor = 0    # position of the next unused character in raw
+    recovered = []
+    for token in morphs_str.split(' + '):
+        morph = Morph.parse(token)
+        end = cursor + len(morph.lex)
+        if end > len(raw):
+            # raw word is too short for this morpheme
+            logging.error(line)
+            raise IndexError('string index out of range')
+        # each morpheme takes as many raw characters as its lexical form is long
+        morph.lex = raw[cursor:end]
+        cursor = end
+        recovered.append(morph)
+    morphs_new = ' + '.join(map(str, recovered))
+    logging.debug('%s\t%s\t%s => %s', wid, raw, morphs_str, morphs_new)
+    return '\t'.join((wid, raw, morphs_new))
+
+
+def run():
+    """
+    read the corpus from standard input, recover eojeol lines that fail to parse with a
+    raw-morph mismatch and write the result to standard output
+    """
+    source_name = os.path.basename(sys.stdin.name)
+    for lineno, raw_line in enumerate(sys.stdin, start=1):
+        text = raw_line.rstrip('\r\n')
+        if WORD_ID_PTN.match(text):
+            try:
+                # parsing is used only as a validity check; the result is discarded
+                Word.parse(text, source_name, lineno)
+            except ParseError as par_err:
+                if 'raw-morph mismatch' not in str(par_err):
+                    raise par_err
+                text = _recover(text)
+        print(text)
+
+
+########
+# main #
+########
+def main():
+    """
+    parse command line arguments, redirect the standard streams if requested and call run()
+    """
+    arg_parser = ArgumentParser(description='어절의 원문과 형태소 분석 결과의 문자가 정규화하면 같지만 코드가 다른 경우'
+                                            ' 원문의 문자로 복원')
+    arg_parser.add_argument('--input', help='input file ', metavar='FILE')
+    arg_parser.add_argument('--output', help='output file ', metavar='FILE')
+    arg_parser.add_argument('--debug', help='enable debug', action='store_true')
+    opts = arg_parser.parse_args()
+
+    # run() works on sys.stdin / sys.stdout, so the files replace the standard streams
+    if opts.input:
+        sys.stdin = open(opts.input, 'r', encoding='UTF-8')
+    if opts.output:
+        sys.stdout = open(opts.output, 'w', encoding='UTF-8')
+    logging.basicConfig(level=logging.DEBUG if opts.debug else logging.INFO)
+
+    run()
+
+
+if __name__ == '__main__':
+ main()
diff --git a/munjong/bin/recover_wide_quotation.py b/munjong/bin/recover_wide_quotation.py
new file mode 100755
index 0000000..0ee0792
--- /dev/null
+++ b/munjong/bin/recover_wide_quotation.py
@@ -0,0 +1,108 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+
+"""
+recover wide char quotations in Sejong corpus
+__author__ = 'Jamie (jamie.lim@kakaocorp.com)'
+__copyright__ = 'Copyright (C) 2019-, Kakao Corp. All rights reserved.'
+"""
+
+
+###########
+# imports #
+###########
+from argparse import ArgumentParser
+import logging
+import os
+import sys
+
+from khaiii.munjong.sejong_corpus import Word, WORD_ID_PTN
+
+
+#############
+# constants #
+#############
+# maps every supported quotation mark (ASCII, typographic or backquote) to the plain ASCII
+# double or single quote, so that variants of the same mark can be compared
+_QUOT_NORM = {
+ '"': '"',
+ '“': '"',
+ '”': '"',
+ "'": "'",
+ "‘": "'",
+ "’": "'",
+ "`": "'",
+}
+
+
+#############
+# functions #
+#############
+def _recover(word: Word):
+    """
+    recover wide char quotations
+
+    Quotation morphemes (tag SS) are aligned, in order, with the quotation marks of the raw
+    word and their lexical form is overwritten with the character used in the raw word.
+    Nothing is changed when the two sequences cannot be aligned.
+    Args:
+        word:  Word object (modified in place)
+    """
+    word_quots = [char for char in word.raw if char in _QUOT_NORM]
+    morph_quots = []
+    for morph in word.morphs:
+        if morph.tag != 'SS' or morph.lex not in _QUOT_NORM:
+            continue
+        morph_quots.append(morph)
+        quot_idx = len(morph_quots) - 1
+        if len(word_quots) <= quot_idx or \
+                _QUOT_NORM[word_quots[quot_idx]] != _QUOT_NORM[morph.lex]:
+            logging.error('%d-th quots are different: %s', quot_idx+1, word)
+            return
+    if len(word_quots) != len(morph_quots):
+        # some quotation marks are not standalone SS morphemes, so the morphemes cannot be
+        # aligned one by one; compare the quotation characters of the whole analysis instead
+        lex_quots = [char for char in word.morph_str() if char in _QUOT_NORM]
+        if word_quots != lex_quots:
+            logging.error('number of quots are different: %s', word)
+        # if they are equal the analysis already carries the raw characters: nothing to do
+        # (falling through to the loop below used to fail while unpacking single characters)
+        return
+    for word_char, morph in zip(word_quots, morph_quots):
+        if word_char != morph.lex:
+            morph.lex = word_char
+
+
+def run():
+    """
+    read the corpus from standard input, recover quotation marks of eojeol lines
+    and write the result to standard output
+    """
+    source_name = os.path.basename(sys.stdin.name)
+    for lineno, raw_line in enumerate(sys.stdin, start=1):
+        text = raw_line.rstrip('\r\n')
+        if WORD_ID_PTN.match(text):
+            word = Word.parse(text, source_name, lineno)
+            _recover(word)
+            print(word)
+        else:
+            # not an eojeol line: pass through unchanged
+            print(text)
+
+
+########
+# main #
+########
+def main():
+    """
+    parse command line arguments, redirect the standard streams if requested and call run()
+
+    Files given by --input / --output replace sys.stdin / sys.stdout. They are opened
+    explicitly as UTF-8 so that the result does not depend on the locale's default encoding.
+    """
+    parser = ArgumentParser(description='recover wide char quotations in Sejong corpus')
+    parser.add_argument('--input', help='input file ', metavar='FILE')
+    parser.add_argument('--output', help='output file ', metavar='FILE')
+    parser.add_argument('--debug', help='enable debug', action='store_true')
+    args = parser.parse_args()
+
+    if args.input:
+        sys.stdin = open(args.input, 'rt', encoding='UTF-8')
+    if args.output:
+        sys.stdout = open(args.output, 'wt', encoding='UTF-8')
+    if args.debug:
+        logging.basicConfig(level=logging.DEBUG)
+    else:
+        logging.basicConfig(level=logging.INFO)
+
+    run()
+
+
+if __name__ == '__main__':
+ main()
diff --git a/munjong/bin/remove_sejong_period_error.py b/munjong/bin/remove_sejong_period_error.py
new file mode 100755
index 0000000..59ffd86
--- /dev/null
+++ b/munjong/bin/remove_sejong_period_error.py
@@ -0,0 +1,120 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+
+"""
+remove wrong sentence breaking marks after period error eojeol
+__author__ = 'Jamie (jamie.lim@kakaocorp.com)'
+__copyright__ = 'Copyright (C) 2017-, Kakao Corp. All rights reserved.'
+"""
+
+
+###########
+# imports #
+###########
+from argparse import ArgumentParser
+import logging
+import os
+import re
+import sys
+from typing import Iterator, TextIO, Tuple
+
+from khaiii.munjong.sejong_corpus import Morph, WORD_ID_PTN
+
+
+#############
+# functions #
+#############
+def _get_three_lines(fin: TextIO) -> Iterator[Tuple[str, str, str]]:
+    """
+    get three lines tuple from file (generator)
+
+    The first two lines of the input are written to standard output as a side effect: they
+    never appear as the current line of a yielded tuple, and run() prints only current lines.
+    Args:
+        fin:  input file
+    Yields:
+        prev. prev. line
+        prev. line
+        curr. line
+    """
+    first = fin.readline()
+    second = fin.readline()
+    # readline() returns '' only at end of file; do not echo lines that do not exist
+    if first:
+        print(first.rstrip('\r\n'))
+    if second:
+        print(second.rstrip('\r\n'))
+    prev_prev_line = first.rstrip('\r\n')
+    prev_line = second.rstrip('\r\n')
+    for curr_line in fin:
+        curr_line = curr_line.rstrip('\r\n')
+        yield prev_prev_line, prev_line, curr_line
+        prev_prev_line = prev_line
+        prev_line = curr_line
+
+
+def _is_known_period_error_eojeol(line: str) -> bool:
+    """
+    whether the eojeol contains one of the known sentence-splitting errors
+    Args:
+        line:  line (eojeol)
+    Returns:
+        whether has error or not
+    """
+    cols = line.split('\t')
+    if len(cols) != 3 or not WORD_ID_PTN.match(cols[0]):
+        return False
+    morphs_str = cols[2]
+    # an SF morpheme must be followed by further morphemes
+    if '/SF + ' not in morphs_str:
+        return False
+    # these two endings are not treated as errors
+    if (re.match(r'.+/EF \+ ./SF$', morphs_str) or
+            re.match(r'.+/SF \+ [\'"’”]/SS$', morphs_str)):
+        return False
+    tags_str = '+'.join(Morph.parse(token).tag for token in morphs_str.split(' + '))
+    if tags_str.endswith('+SF'):
+        return False
+    if 'SN+SF+SN' in tags_str:
+        # e.g. "4.6판": 4/SN + ./SF + 6/SN + 판/NNB (but not when the first SN is really XSN)
+        return 'XSN+SF+SN' not in tags_str
+    # e.g. "S.M.오너": S/SL + ./SF + M/SL + ./SF + 오너/NNG
+    return 'SL+SF+SL' in tags_str
+
+
+def run():
+    """
+    read the corpus from standard input and write it to standard output, dropping the
+    paragraph break that directly follows a known period error eojeol
+    """
+    # NOTE(review): the tag literals below arrived here stripped (the first one as a line break
+    # inside the quotes, the other two as ''). '</p>' and '<p>' are reconstructions based on
+    # the three-line window logic - confirm against the upstream script.
+    file_name = os.path.basename(sys.stdin.name)
+    for line_num, (prev_prev_line, prev_line, curr_line) in enumerate(_get_three_lines(sys.stdin),
+                                                                      start=1):
+        if curr_line == '</p>' and _is_known_period_error_eojeol(prev_line):
+            # closing tag right after the error eojeol
+            continue
+        if prev_line == '</p>' and curr_line == '<p>' and \
+                _is_known_period_error_eojeol(prev_prev_line):
+            # opening tag that follows the closing tag dropped in the previous iteration
+            logging.info('%s:%d\t%s', file_name, line_num, prev_prev_line)
+            continue
+        print(curr_line)
+
+
+########
+# main #
+########
+def main():
+    """
+    parse command line arguments, redirect the standard streams if requested and call run()
+
+    Files given by --input / --output replace sys.stdin / sys.stdout. They are opened
+    explicitly as UTF-8 so that the result does not depend on the locale's default encoding.
+    """
+    parser = ArgumentParser(description='remove wrong sentence breaking marks after'
+                                        ' period error eojeol')
+    parser.add_argument('--input', help='input file ', metavar='FILE')
+    parser.add_argument('--output', help='output file ', metavar='FILE')
+    parser.add_argument('--debug', help='enable debug', action='store_true')
+    args = parser.parse_args()
+
+    if args.input:
+        sys.stdin = open(args.input, 'rt', encoding='UTF-8')
+    if args.output:
+        sys.stdout = open(args.output, 'wt', encoding='UTF-8')
+    if args.debug:
+        logging.basicConfig(level=logging.DEBUG)
+    else:
+        logging.basicConfig(level=logging.INFO)
+
+    run()
+
+
+if __name__ == '__main__':
+ main()
diff --git a/src/main/python/khaiii/resource/jaso.py b/src/main/python/khaiii/resource/jaso.py
index 1f5e184..7b667b8 100644
--- a/src/main/python/khaiii/resource/jaso.py
+++ b/src/main/python/khaiii/resource/jaso.py
@@ -17,21 +17,63 @@
#############
# constants #
#############
-_FIRST = ['\u3131', '\u3132', '\u3134', '\u3137', '\u3138', # 초성
- '\u3139', '\u3141', '\u3142', '\u3143', '\u3145',
- '\u3146', '\u3147', '\u3148', '\u3149', '\u314a',
- '\u314b', '\u314c', '\u314d', '\u314e']
-_MIDDLE = ['\u314f', '\u3150', '\u3151', '\u3152', '\u3153', # 중성
- '\u3154', '\u3155', '\u3156', '\u3157', '\u3158',
- '\u3159', '\u315a', '\u315b', '\u315c', '\u315d',
- '\u315e', '\u315f', '\u3160', '\u3161', '\u3162',
- '\u3163']
-_LAST = ['\u3131', '\u3132', '\u3133', '\u3134', '\u3135', # 종성
- '\u3136', '\u3137', '\u3139', '\u313a', '\u313b',
- '\u313c', '\u313d', '\u313e', '\u313f', '\u3140',
- '\u3141', '\u3142', '\u3144', '\u3145', '\u3146',
- '\u3147', '\u3148', '\u314a', '\u314b', '\u314c',
- '\u314d', '\u314e']
+# Hangul compatibility jamo area (initial and final consonants share the same codes;
+# these are the codes entered when typing on a 2-beolsik keyboard)
+_FIRST_COMPAT = ['\u3131', '\u3132', '\u3134', '\u3137', '\u3138', # initial consonants
+ '\u3139', '\u3141', '\u3142', '\u3143', '\u3145',
+ '\u3146', '\u3147', '\u3148', '\u3149', '\u314a',
+ '\u314b', '\u314c', '\u314d', '\u314e']
+_MIDDLE_COMPAT = ['\u314f', '\u3150', '\u3151', '\u3152', '\u3153', # medial vowels
+ '\u3154', '\u3155', '\u3156', '\u3157', '\u3158',
+ '\u3159', '\u315a', '\u315b', '\u315c', '\u315d',
+ '\u315e', '\u315f', '\u3160', '\u3161', '\u3162',
+ '\u3163']
+_LAST_COMPAT = ['\u3131', '\u3132', '\u3133', '\u3134', '\u3135', # final consonants
+ '\u3136', '\u3137', '\u3139', '\u313a', '\u313b',
+ '\u313c', '\u313d', '\u313e', '\u313f', '\u3140',
+ '\u3141', '\u3142', '\u3144', '\u3145', '\u3146',
+ '\u3147', '\u3148', '\u314a', '\u314b', '\u314c',
+ '\u314d', '\u314e']
+# concatenation in the order initial, medial, final
+_ALL_COMPAT = _FIRST_COMPAT + _MIDDLE_COMPAT + _LAST_COMPAT
+
+# Hangul jamo area (initial and final consonants have different codes;
+# these are the codes used in the Sejong corpus)
+_FIRST_JAMO = ['\u1100', '\u1101', '\u1102', '\u1103', '\u1104', # initial consonants
+ '\u1105', '\u1106', '\u1107', '\u1108', '\u1109',
+ '\u110a', '\u110b', '\u110c', '\u110d', '\u110e',
+ '\u110f', '\u1110', '\u1111', '\u1112']
+_MIDDLE_JAMO = ['\u1161', '\u1162', '\u1163', '\u1164', '\u1165', # medial vowels
+ '\u1166', '\u1167', '\u1168', '\u1169', '\u116a',
+ '\u116b', '\u116c', '\u116d', '\u116e', '\u116f',
+ '\u1170', '\u1171', '\u1172', '\u1173', '\u1174',
+ '\u1175']
+_LAST_JAMO = ['\u11a8', '\u11a9', '\u11aa', '\u11ab', '\u11ac', # final consonants
+ '\u11ad', '\u11ae', '\u11af', '\u11b0', '\u11b1',
+ '\u11b2', '\u11b3', '\u11b4', '\u11b5', '\u11b6',
+ '\u11b7', '\u11b8', '\u11b9', '\u11ba', '\u11bb',
+ '\u11bc', '\u11bd', '\u11be', '\u11bf', '\u11c0',
+ '\u11c1', '\u11c2']
+_ALL_JAMO = _FIRST_JAMO + _MIDDLE_JAMO + _LAST_JAMO
+_ALL_JAMO_SET = set(_ALL_JAMO)
+# pairs each jamo code with the compatibility code at the same position of _ALL_COMPAT
+_JAMO_TO_COMPAT = dict(zip(_ALL_JAMO, _ALL_COMPAT))
+
+# halfwidth jamo area (like the compatibility area, initial and final consonants share
+# the same codes, but the characters are half as wide)
+_FIRST_HALFWIDTH = ['\uffa1', '\uffa2', '\uffa4', '\uffa7', '\uffa8', # initial consonants
+ '\uffa9', '\uffb1', '\uffb2', '\uffb3', '\uffb5',
+ '\uffb6', '\uffb7', '\uffb8', '\uffb9', '\uffba',
+ '\uffbb', '\uffbc', '\uffbd', '\uffbe']
+_MIDDLE_HALFWIDTH = ['\uffc2', '\uffc3', '\uffc4', '\uffc5', '\uffc6', # medial vowels
+ '\uffc7', '\uffca', '\uffcb', '\uffcc', '\uffcd',
+ '\uffce', '\uffcf', '\uffd2', '\uffd3', '\uffd4',
+ '\uffd5', '\uffd6', '\uffd7', '\uffda', '\uffdb',
+ '\uffdc']
+_LAST_HALFWIDTH = ['\uffa1', '\uffa2', '\uffa3', '\uffa4', '\uffa5', # final consonants
+ '\uffa6', '\uffa7', '\uffa9', '\uffaa', '\uffab',
+ '\uffac', '\uffad', '\uffae', '\uffaf', '\uffb0',
+ '\uffb1', '\uffb2', '\uffb4', '\uffb5', '\uffb6',
+ '\uffb7', '\uffb8', '\uffba', '\uffbb', '\uffbc',
+ '\uffbd', '\uffbe']
+_ALL_HALFWIDTH = _FIRST_HALFWIDTH + _MIDDLE_HALFWIDTH + _LAST_HALFWIDTH
+_ALL_HALFWIDTH_SET = set(_ALL_HALFWIDTH)
+# pairs each halfwidth code with the compatibility code at the same position of _ALL_COMPAT
+_HALFWIDTH_TO_COMPAT = dict(zip(_ALL_HALFWIDTH, _ALL_COMPAT))
#############
@@ -54,12 +96,12 @@ def _decomp_char(char: str) -> Tuple[str, str, str]:
first_idx = first_start // 21
middle_idx = first_start % 21
- first = _FIRST[first_idx]
- middle = _MIDDLE[middle_idx]
+ first = _FIRST_COMPAT[first_idx]
+ middle = _MIDDLE_COMPAT[middle_idx]
if not last_idx:
return first, middle
- last = _LAST[last_idx-1]
+ last = _LAST_COMPAT[last_idx-1]
return first, middle, last
@@ -71,6 +113,9 @@ def decompose(text: str) -> str:
Returns:
자소 분해된 텍스트
"""
+ if not text:
+ return text
+
decomposed = []
for char in text:
code = ord(char)
@@ -79,3 +124,25 @@ def decompose(text: str) -> str:
else:
decomposed.extend(_decomp_char(char))
return ''.join(decomposed)
+
+
+def norm_compat(text: str) -> str:
+    """
+    normalize Hangul jamo characters in the text into the compatibility jamo area
+    Args:
+        text:  Hangul text
+    Returns:
+        text whose jamo are normalized into the compatibility area
+    """
+    if not text:
+        return text
+
+    def _to_compat(char: str) -> str:
+        """map a single character; characters outside both tables are returned unchanged"""
+        if char in _ALL_JAMO_SET:
+            return _JAMO_TO_COMPAT[char]
+        if char in _ALL_HALFWIDTH_SET:
+            return _HALFWIDTH_TO_COMPAT[char]
+        return char
+
+    return ''.join(map(_to_compat, text))