From 767d05734b845d89ed51e9d605d20217116842e4 Mon Sep 17 00:00:00 2001
From: Jamie <jamie.lim@kakaocorp.com>
Date: Mon, 28 Jan 2019 20:21:51 +0900
Subject: [PATCH] =?UTF-8?q?=EC=84=B8=EC=A2=85=20=EC=BD=94=ED=8D=BC?=
 =?UTF-8?q?=EC=8A=A4=EC=9D=98=20=EC=98=A4=EB=A5=98=EC=99=80=20=EA=B4=80?=
 =?UTF-8?q?=EB=A0=A8=EB=90=9C=20=EA=B0=81=EC=A2=85=20=EC=8A=A4=ED=81=AC?=
 =?UTF-8?q?=EB=A6=BD=ED=8A=B8=20=EC=B6=94=EA=B0=80=20#3?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 munjong/bin/convert_jamo_to_compat.py     |  79 ++++++++++++++
 munjong/bin/detect_sejong_period_error.py | 106 +++++++++++++++++++
 munjong/bin/fix_final_symbol_error.py     |  86 ++++++++++++++++
 munjong/bin/recover_english_case.py       |  96 +++++++++++++++++
 munjong/bin/recover_raw_morph_mismatch.py | 101 ++++++++++++++++++
 munjong/bin/recover_wide_quotation.py     | 108 +++++++++++++++++++
 munjong/bin/remove_sejong_period_error.py | 120 ++++++++++++++++++++++
 src/main/python/khaiii/resource/jaso.py   | 103 +++++++++++++++----
 8 files changed, 781 insertions(+), 18 deletions(-)
 create mode 100755 munjong/bin/convert_jamo_to_compat.py
 create mode 100755 munjong/bin/detect_sejong_period_error.py
 create mode 100755 munjong/bin/fix_final_symbol_error.py
 create mode 100755 munjong/bin/recover_english_case.py
 create mode 100755 munjong/bin/recover_raw_morph_mismatch.py
 create mode 100755 munjong/bin/recover_wide_quotation.py
 create mode 100755 munjong/bin/remove_sejong_period_error.py

diff --git a/munjong/bin/convert_jamo_to_compat.py b/munjong/bin/convert_jamo_to_compat.py
new file mode 100755
index 0000000..6ea1426
--- /dev/null
+++ b/munjong/bin/convert_jamo_to_compat.py
@@ -0,0 +1,79 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+
+"""
+한글 자모 영역의 코드를 호환 영역으로 변환
+__author__ = 'Jamie (jamie.lim@kakaocorp.com)'
+__copyright__ = 'Copyright (C) 2019-, Kakao Corp. All rights reserved.'
+"""
+
+
+###########
+# imports #
+###########
+from argparse import ArgumentParser
+import logging
+import sys
+
+from khaiii.munjong.sejong_corpus import WORD_ID_PTN
+from khaiii.resource.jaso import norm_compat
+
+
+#############
+# functions #
+#############
+def _norm(text: str) -> str:
+    """
+    Normalize Hangul jamo in a text to the compatibility jamo area.
+    Args:
+        text:  input text
+    Returns:
+        normalized text
+    """
+    compat = norm_compat(text)
+    # two more letters are replaced on top of norm_compat():
+    # 0x119e -> 0x318d, then 0x111d -> 0x3171
+    return compat.replace('ᆞ', 'ㆍ').replace('ᄝ', 'ㅱ')
+
+
+def run():
+    """
+    Copy stdin to stdout, normalizing the word and morph columns of word lines.
+    """
+    for raw_line in sys.stdin:
+        text = raw_line.rstrip('\r\n')
+        if WORD_ID_PTN.match(text):
+            # word line: keep the ID column, normalize the other two columns
+            wid, word, morph = text.split('\t')
+            text = '\t'.join([wid, _norm(word), _norm(morph)])
+        print(text)
+
+
+########
+# main #
+########
+def main():
+    """
+    Parse command line arguments, redirect stdin/stdout if requested, then run.
+    """
+    parser = ArgumentParser(description='한글 자모 영역의 코드를 호환 영역으로 변환')
+    parser.add_argument('--input', help='input file <default: stdin>', metavar='FILE')
+    parser.add_argument('--output', help='output file <default: stdout>', metavar='FILE')
+    parser.add_argument('--debug', help='enable debug', action='store_true')
+    args = parser.parse_args()
+
+    # --input / --output replace the standard streams that run() works on
+    if args.input:
+        sys.stdin = open(args.input, 'r', encoding='UTF-8')
+    if args.output:
+        sys.stdout = open(args.output, 'w', encoding='UTF-8')
+
+    # --debug lowers the root logger threshold from INFO to DEBUG
+    logging.basicConfig(level=logging.DEBUG if args.debug else logging.INFO)
+
+    run()
+
+
+if __name__ == '__main__':
+    main()
diff --git a/munjong/bin/detect_sejong_period_error.py b/munjong/bin/detect_sejong_period_error.py
new file mode 100755
index 0000000..7dfd4fb
--- /dev/null
+++ b/munjong/bin/detect_sejong_period_error.py
@@ -0,0 +1,106 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+
+"""
+detect period error of Sejong corpus
+__author__ = 'Jamie (jamie.lim@kakaocorp.com)'
+__copyright__ = 'Copyright (C) 2019-, Kakao Corp. All rights reserved.'
+"""
+
+
+###########
+# imports #
+###########
+from argparse import ArgumentParser
+import logging
+import os
+import re
+import sys
+from typing import Iterator, TextIO, Tuple
+
+from khaiii.munjong.sejong_corpus import Morph, WORD_ID_PTN
+
+
+#############
+# functions #
+#############
+def _get_two_lines(fin: TextIO) -> Iterator[Tuple[str, str]]:
+    """
+    generate every pair of consecutive lines of a file
+    Args:
+        fin:  input file
+    Yields:
+        current line (trailing CR/LF stripped)
+        next line (trailing CR/LF stripped)
+    """
+    stripped = (raw_line.rstrip('\r\n') for raw_line in fin)
+    previous = next(stripped, '')
+    for following in stripped:
+        yield previous, following
+        previous = following
+
+
+def _is_correct_eos(line: str) -> bool:
+    """
+    check whether a word line ends in one of the accepted sentence-final forms
+    Args:
+        line:  word line made of three tab separated columns
+    Returns:
+        True if the morph column has an accepted ending, otherwise False
+    """
+    _wid, _raw, morphs_str = line.split('\t')
+    # accepted on the morph string itself:
+    #   ".../EF + ?/SF" and ".../SF + <closing quotation or bracket>/SS"
+    if re.match(r'.+/EF \+ ./SF$', morphs_str) or \
+            re.match(r'.+/SF \+ [\'"’”」\]]/SS$', morphs_str):
+        return True
+    # everything else is decided on the sequence of tags alone
+    tags = [Morph.parse(token).tag for token in morphs_str.split(' + ')]
+    tags_str = '+'.join(tags)
+    return tags_str.endswith(('+SF+SS+JKQ', '+SF+SS+VCP+ETM'))
+
+
+def run():
+    """
+    Print "file:line<TAB>word line" for every suspicious sentence-final word line.
+    """
+    file_name = os.path.basename(sys.stdin.name)
+    for line_num, (curr_line, next_line) in enumerate(_get_two_lines(sys.stdin), start=1):
+        cols = curr_line.split('\t')
+        is_word = len(cols) == 3 and WORD_ID_PTN.match(cols[0])
+        # candidate: an SF followed by another morph, right before a line starting with '</'
+        is_candidate = is_word and '/SF + ' in cols[2] and next_line.startswith('</')
+        if not is_candidate or _is_correct_eos(curr_line):
+            continue
+        # line_num counts the yielded pairs from 1, so it is the line number of curr_line
+        print('{}:{}\t{}'.format(file_name, line_num, curr_line))
+
+
+########
+# main #
+########
+def main():
+    """
+    Parse command line arguments, redirect stdin/stdout if requested, then run.
+    """
+    parser = ArgumentParser(description='detect period error of Sejong corpus')
+    parser.add_argument('--input', help='input file <default: stdin>', metavar='FILE')
+    parser.add_argument('--output', help='output file <default: stdout>', metavar='FILE')
+    parser.add_argument('--debug', help='enable debug', action='store_true')
+    args = parser.parse_args()
+
+    # files are opened as UTF-8 explicitly instead of the locale's default encoding
+    if args.input:
+        sys.stdin = open(args.input, 'rt', encoding='UTF-8')
+    if args.output:
+        sys.stdout = open(args.output, 'wt', encoding='UTF-8')
+
+    # --debug lowers the root logger threshold from INFO to DEBUG
+    logging.basicConfig(level=logging.DEBUG if args.debug else logging.INFO)
+
+    run()
+
+
+if __name__ == '__main__':
+    main()
diff --git a/munjong/bin/fix_final_symbol_error.py b/munjong/bin/fix_final_symbol_error.py
new file mode 100755
index 0000000..5ba1644
--- /dev/null
+++ b/munjong/bin/fix_final_symbol_error.py
@@ -0,0 +1,86 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+
+"""
+fix final symbol errors on Sejong corpus
+__author__ = 'Jamie (jamie.lim@kakaocorp.com)'
+__copyright__ = 'Copyright (C) 2019-, Kakao Corp. All rights reserved.'
+"""
+
+
+###########
+# imports #
+###########
+from argparse import ArgumentParser
+import logging
+import os
+import sys
+
+from khaiii.munjong.sejong_corpus import Morph, Word, WORD_ID_PTN
+
+
+#############
+# functions #
+#############
+def _attach_missing_symbol(word: Word):
+    """
+    append a morph for a final symbol that the raw word has but the morphs lack
+    Args:
+        word:  Word object (its morphs list is extended in place)
+    """
+    lexes = ''.join(morph.lex for morph in word.morphs)
+    # only handle raw words that are exactly the morph lexes plus one more character
+    if len(word.raw) != len(lexes) + 1 or not word.raw.startswith(lexes):
+        return
+    symbol = word.raw[-1]
+    if symbol == '.':
+        # a period is attached as SF only when the last morph is tagged EC
+        if word.morphs[-1].tag == 'EC':
+            word.morphs.append(Morph('.', 'SF'))
+    elif symbol in (',', '"'):
+        word.morphs.append(Morph(symbol, 'SP' if symbol == ',' else 'SS'))
+
+
+def run():
+    """
+    Copy stdin to stdout, passing every word line through _attach_missing_symbol().
+    """
+    file_name = os.path.basename(sys.stdin.name)
+    for line_num, raw_line in enumerate(sys.stdin, start=1):
+        text = raw_line.rstrip('\r\n')
+        if WORD_ID_PTN.match(text):
+            word = Word.parse(text, file_name, line_num)
+            _attach_missing_symbol(word)
+            print(word)
+        else:
+            print(text)    # every other line is copied as is
+
+
+########
+# main #
+########
+def main():
+    """
+    Parse command line arguments, redirect stdin/stdout if requested, then run.
+    """
+    parser = ArgumentParser(description='fix final symbol errors on Sejong corpus')
+    parser.add_argument('--input', help='input file <default: stdin>', metavar='FILE')
+    parser.add_argument('--output', help='output file <default: stdout>', metavar='FILE')
+    parser.add_argument('--debug', help='enable debug', action='store_true')
+    args = parser.parse_args()
+
+    # files are opened as UTF-8 explicitly instead of the locale's default encoding
+    if args.input:
+        sys.stdin = open(args.input, 'rt', encoding='UTF-8')
+    if args.output:
+        sys.stdout = open(args.output, 'wt', encoding='UTF-8')
+
+    # --debug lowers the root logger threshold from INFO to DEBUG
+    logging.basicConfig(level=logging.DEBUG if args.debug else logging.INFO)
+
+    run()
+
+
+if __name__ == '__main__':
+    main()
diff --git a/munjong/bin/recover_english_case.py b/munjong/bin/recover_english_case.py
new file mode 100755
index 0000000..5a0dd9d
--- /dev/null
+++ b/munjong/bin/recover_english_case.py
@@ -0,0 +1,96 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+
+"""
+recover cases of English letters in Sejong corpus
+__author__ = 'Jamie (jamie.lim@kakaocorp.com)'
+__copyright__ = 'Copyright (C) 2019-, Kakao Corp. All rights reserved.'
+"""
+
+
+###########
+# imports #
+###########
+from argparse import ArgumentParser
+import copy
+import logging
+import os
+import re
+import sys
+
+from khaiii.munjong.sejong_corpus import Word, WORD_ID_PTN
+
+
+#############
+# functions #
+#############
+def _recover(word: Word):
+    """
+    overwrite ASCII letters in the morphs with the letters found in the raw word
+    Args:
+        word:  Word object; its morphs are replaced only if some letter differed
+    """
+    raw_letters = [char for char in word.raw if re.match(r'[a-zA-Z]', char)]
+    fixed = copy.deepcopy(word)    # edited copy; word is touched only at the very end
+    cursor, changed = 0, False    # cursor: index of the next raw letter to use
+    for morph in fixed.morphs:
+        chars = list(morph.lex)
+        for pos, char in enumerate(chars):
+            if not re.match(r'[a-zA-Z]', char):
+                continue
+            # running out of raw letters raises IndexError here
+            changed = changed or raw_letters[cursor] != char
+            chars[pos] = raw_letters[cursor]
+            cursor += 1
+        morph.lex = ''.join(chars)
+    if changed:
+        logging.info('%s  =>  %s', str(word), fixed.morph_str())
+        word.morphs = fixed.morphs
+
+
+def run():
+    """
+    Copy stdin to stdout, recovering the letter cases of word lines on the way.
+    """
+    file_name = os.path.basename(sys.stdin.name)
+    for line_num, raw_line in enumerate(sys.stdin, start=1):
+        text = raw_line.rstrip('\r\n')
+        if not WORD_ID_PTN.match(text):
+            print(text)    # not a word line: copied as is
+            continue
+        word = Word.parse(text, file_name, line_num)
+        try:
+            _recover(word)
+        except IndexError as idx_err:
+            logging.error('%s(%d): %s: %s', file_name, line_num, idx_err, word)
+        print(word)    # printed whether or not the recovery raised IndexError
+
+
+########
+# main #
+########
+def main():
+    """
+    Parse command line arguments, redirect stdin/stdout if requested, then run.
+    """
+    parser = ArgumentParser(description='recover cases of English letters in Sejong corpus')
+    parser.add_argument('--input', help='input file <default: stdin>', metavar='FILE')
+    parser.add_argument('--output', help='output file <default: stdout>', metavar='FILE')
+    parser.add_argument('--debug', help='enable debug', action='store_true')
+    args = parser.parse_args()
+
+    # --input / --output replace the standard streams that run() works on
+    if args.input:
+        sys.stdin = open(args.input, 'r', encoding='UTF-8')
+    if args.output:
+        sys.stdout = open(args.output, 'w', encoding='UTF-8')
+
+    # --debug lowers the root logger threshold from INFO to DEBUG
+    logging.basicConfig(level=logging.DEBUG if args.debug else logging.INFO)
+
+    run()
+
+
+if __name__ == '__main__':
+    main()
diff --git a/munjong/bin/recover_raw_morph_mismatch.py b/munjong/bin/recover_raw_morph_mismatch.py
new file mode 100755
index 0000000..2bcb136
--- /dev/null
+++ b/munjong/bin/recover_raw_morph_mismatch.py
@@ -0,0 +1,101 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+
+"""
+어절의 원문과 형태소 분석 결과의 문자가 정규화하면 같지만 코드가 다른 경우 원문의 문자로 복원
+__author__ = 'Jamie (jamie.lim@kakaocorp.com)'
+__copyright__ = 'Copyright (C) 2019-, Kakao Corp. All rights reserved.'
+"""
+
+
+###########
+# imports #
+###########
+from argparse import ArgumentParser
+import logging
+import os
+import sys
+
+from khaiii.munjong.sejong_corpus import Morph, ParseError, Word, WORD_ID_PTN
+
+
+#############
+# functions #
+#############
+def _recover(line: str) -> str:
+    """
+    rebuild the morph lexes of a word line from the characters of its raw word
+    Args:
+        line:  word line (ID, raw word and morphs separated by tabs)
+    Returns:
+        line whose morph lexes are taken, in order, from the raw word
+    """
+    wid, raw, morphs_str = line.split('\t')
+    morphs = []
+    consumed = 0    # number of raw characters already handed out to morphs
+    for token_str in morphs_str.split(' + '):
+        morph = Morph.parse(token_str)
+        stop = consumed + len(morph.lex)
+        try:
+            # index character by character (no slicing) so that running past
+            # the end of the raw word raises IndexError
+            morph.lex = ''.join([raw[pos] for pos in range(consumed, stop)])
+        except IndexError:
+            logging.error(line)
+            raise
+        consumed = stop
+        morphs.append(morph)
+    morphs_new = ' + '.join(str(morph) for morph in morphs)
+    logging.debug('%s\t%s\t%s  =>  %s', wid, raw, morphs_str, morphs_new)
+    return '{}\t{}\t{}'.format(wid, raw, morphs_new)
+
+
+def run():
+    """
+    Copy stdin to stdout, rewriting word lines that fail with a raw-morph mismatch.
+    """
+    file_name = os.path.basename(sys.stdin.name)
+    for line_num, raw_line in enumerate(sys.stdin, start=1):
+        text = raw_line.rstrip('\r\n')
+        if not WORD_ID_PTN.match(text):
+            print(text)
+            continue
+        try:
+            # the parsed Word is not used; parsing only tells whether the line is broken
+            Word.parse(text, file_name, line_num)
+        except ParseError as par_err:
+            if 'raw-morph mismatch' not in str(par_err):
+                raise
+            text = _recover(text)
+        print(text)
+
+
+########
+# main #
+########
+def main():
+    """
+    Parse command line arguments, redirect stdin/stdout if requested, then run.
+    """
+    parser = ArgumentParser(description='어절의 원문과 형태소 분석 결과의 문자가 정규화하면 같지만 코드가 다른 경우'
+                                        ' 원문의 문자로 복원')
+    parser.add_argument('--input', help='input file <default: stdin>', metavar='FILE')
+    parser.add_argument('--output', help='output file <default: stdout>', metavar='FILE')
+    parser.add_argument('--debug', help='enable debug', action='store_true')
+    args = parser.parse_args()
+
+    # --input / --output replace the standard streams that run() works on
+    if args.input:
+        sys.stdin = open(args.input, 'r', encoding='UTF-8')
+    if args.output:
+        sys.stdout = open(args.output, 'w', encoding='UTF-8')
+
+    # --debug lowers the root logger threshold from INFO to DEBUG
+    logging.basicConfig(level=logging.DEBUG if args.debug else logging.INFO)
+
+    run()
+
+
+if __name__ == '__main__':
+    main()
diff --git a/munjong/bin/recover_wide_quotation.py b/munjong/bin/recover_wide_quotation.py
new file mode 100755
index 0000000..0ee0792
--- /dev/null
+++ b/munjong/bin/recover_wide_quotation.py
@@ -0,0 +1,108 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+
+"""
+recover wide char quotations in Sejong corpus
+__author__ = 'Jamie (jamie.lim@kakaocorp.com)'
+__copyright__ = 'Copyright (C) 2019-, Kakao Corp. All rights reserved.'
+"""
+
+
+###########
+# imports #
+###########
+from argparse import ArgumentParser
+import logging
+import os
+import sys
+
+from khaiii.munjong.sejong_corpus import Word, WORD_ID_PTN
+
+
+#############
+# constants #
+#############
+_QUOT_NORM = {    # quotation character -> ASCII quotation mark it is compared as
+    '"': '"',
+    '“': '"',
+    '”': '"',
+    "'": "'",
+    "‘": "'",
+    "’": "'",
+    "`": "'",    # the backquote is compared as a single quotation mark
+}
+
+
+#############
+# functions #
+#############
+def _recover(word: Word):
+    """
+    copy the quotation characters of the raw word onto the matching SS morphs
+    Args:
+        word:  Word object whose quotation morphs may be rewritten in place
+    """
+    raw_quots = [char for char in word.raw if char in _QUOT_NORM]
+    quot_morphs = []
+    for morph in word.morphs:
+        if morph.tag != 'SS' or morph.lex not in _QUOT_NORM:
+            continue
+        # the n-th quotation morph must normalize like the n-th raw quotation
+        nth = len(quot_morphs)
+        quot_morphs.append(morph)
+        if nth >= len(raw_quots) or _QUOT_NORM[raw_quots[nth]] != _QUOT_NORM[morph.lex]:
+            logging.error('%d-th quots are different: %s', nth+1, word)
+            return
+    if len(raw_quots) != len(quot_morphs):
+        # counts differ: report only if the quotations in the morph string differ too
+        if raw_quots != [char for char in word.morph_str() if char in _QUOT_NORM]:
+            logging.error('number of quots are different: %s', word)
+        return
+    for raw_char, morph in zip(raw_quots, quot_morphs):
+        if raw_char != morph.lex:
+            morph.lex = raw_char
+
+
+def run():
+    """
+    Copy stdin to stdout, passing every word line through _recover().
+    """
+    file_name = os.path.basename(sys.stdin.name)
+    for line_num, raw_line in enumerate(sys.stdin, start=1):
+        text = raw_line.rstrip('\r\n')
+        if WORD_ID_PTN.match(text):
+            word = Word.parse(text, file_name, line_num)
+            _recover(word)
+            print(word)
+        else:
+            print(text)    # every other line is copied as is
+
+
+########
+# main #
+########
+def main():
+    """
+    Parse command line arguments, redirect stdin/stdout if requested, then run.
+    """
+    parser = ArgumentParser(description='recover wide char quotations in Sejong corpus')
+    parser.add_argument('--input', help='input file <default: stdin>', metavar='FILE')
+    parser.add_argument('--output', help='output file <default: stdout>', metavar='FILE')
+    parser.add_argument('--debug', help='enable debug', action='store_true')
+    args = parser.parse_args()
+
+    # files are opened as UTF-8 explicitly instead of the locale's default encoding
+    if args.input:
+        sys.stdin = open(args.input, 'rt', encoding='UTF-8')
+    if args.output:
+        sys.stdout = open(args.output, 'wt', encoding='UTF-8')
+
+    # --debug lowers the root logger threshold from INFO to DEBUG
+    logging.basicConfig(level=logging.DEBUG if args.debug else logging.INFO)
+
+    run()
+
+
+if __name__ == '__main__':
+    main()
diff --git a/munjong/bin/remove_sejong_period_error.py b/munjong/bin/remove_sejong_period_error.py
new file mode 100755
index 0000000..59ffd86
--- /dev/null
+++ b/munjong/bin/remove_sejong_period_error.py
@@ -0,0 +1,120 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+
+"""
+remove wrong sentence breaking marks after period error eojeol
+__author__ = 'Jamie (jamie.lim@kakaocorp.com)'
+__copyright__ = 'Copyright (C) 2017-, Kakao Corp. All rights reserved.'
+"""
+
+
+###########
+# imports #
+###########
+from argparse import ArgumentParser
+import logging
+import os
+import re
+import sys
+from typing import Iterator, TextIO, Tuple
+
+from khaiii.munjong.sejong_corpus import Morph, WORD_ID_PTN
+
+
+#############
+# functions #
+#############
+def _get_three_lines(fin: TextIO) -> Iterator[Tuple[str, str, str]]:
+    """
+    get three lines tuple from file (generator)
+    Args:
+        fin:  input file
+    Yields:
+        (prev. prev. line, prev. line, curr. line) tuple with CR/LF stripped.
+        The first two lines are never yielded as curr. line, so they are
+        printed here; a line missing at EOF is not printed as an empty line.
+    """
+    first_two = [line for line in (fin.readline(), fin.readline()) if line]
+    for line in first_two:
+        print(line.rstrip('\r\n'))
+    if len(first_two) < 2:
+        return
+    prev_prev_line, prev_line = [line.rstrip('\r\n') for line in first_two]
+    for curr_line in fin:
+        curr_line = curr_line.rstrip('\r\n')
+        yield prev_prev_line, prev_line, curr_line
+        prev_prev_line, prev_line = prev_line, curr_line
+
+
+def _is_known_period_error_eojeol(line: str) -> bool:
+    """
+    check whether a line is an eojeol holding one of the known sentence split errors
+    Args:
+        line:  line (eojeol)
+    Returns:
+        whether has error or not
+    """
+    cols = line.split('\t')
+    if len(cols) != 3 or not WORD_ID_PTN.match(cols[0]):
+        return False
+    morphs_str = cols[2]
+    # not an error: no SF followed by another morph, or a regular ending
+    # (".../EF + ?/SF" or ".../SF + <closing quotation>/SS")
+    if '/SF + ' not in morphs_str or re.match(r'.+/EF \+ ./SF$', morphs_str) or \
+            re.match(r'.+/SF \+ [\'"’”]/SS$', morphs_str):
+        return False
+    tags_str = '+'.join([Morph.parse(token).tag for token in morphs_str.split(' + ')])
+    if tags_str.endswith('+SF'):
+        return False
+    if 'SN+SF+SN' in tags_str:
+        # e.g. 4.6판: 4/SN + ./SF + 6/SN + 판/NNB (unless XSN comes right before)
+        return 'XSN+SF+SN' not in tags_str
+    # e.g. S.M.오너: S/SL + ./SF + M/SL + ./SF + 오너/NNG
+    return 'SL+SF+SL' in tags_str
+
+
+def run():
+    """
+    Copy stdin to stdout, dropping the '</p>' / '<p>' lines that follow a known period error.
+    """
+    file_name = os.path.basename(sys.stdin.name)
+    triples = _get_three_lines(sys.stdin)
+    for line_num, (prev_prev_line, prev_line, curr_line) in enumerate(triples, start=1):
+        if curr_line == '</p>' and _is_known_period_error_eojeol(prev_line):
+            continue    # closing tag right after the error eojeol is not printed
+        if (prev_line, curr_line) == ('</p>', '<p>') and \
+                _is_known_period_error_eojeol(prev_prev_line):
+            logging.info('%s:%d\t%s', file_name, line_num, prev_prev_line)
+            continue    # nor is the opening tag that comes right after it
+        print(curr_line)
+
+
+########
+# main #
+########
+def main():
+    """
+    Parse command line arguments, redirect stdin/stdout if requested, then run.
+    """
+    parser = ArgumentParser(description='remove wrong sentence breaking marks after'
+                                        ' period error eojeol')
+    parser.add_argument('--input', help='input file <default: stdin>', metavar='FILE')
+    parser.add_argument('--output', help='output file <default: stdout>', metavar='FILE')
+    parser.add_argument('--debug', help='enable debug', action='store_true')
+    args = parser.parse_args()
+
+    # files are opened as UTF-8 explicitly instead of the locale's default encoding
+    if args.input:
+        sys.stdin = open(args.input, 'rt', encoding='UTF-8')
+    if args.output:
+        sys.stdout = open(args.output, 'wt', encoding='UTF-8')
+
+    # --debug lowers the root logger threshold from INFO to DEBUG
+    logging.basicConfig(level=logging.DEBUG if args.debug else logging.INFO)
+
+    run()
+
+
+if __name__ == '__main__':
+    main()
diff --git a/src/main/python/khaiii/resource/jaso.py b/src/main/python/khaiii/resource/jaso.py
index 1f5e184..7b667b8 100644
--- a/src/main/python/khaiii/resource/jaso.py
+++ b/src/main/python/khaiii/resource/jaso.py
@@ -17,21 +17,63 @@
 #############
 # constants #
 #############
-_FIRST = ['\u3131', '\u3132', '\u3134', '\u3137', '\u3138',    # 초성
-          '\u3139', '\u3141', '\u3142', '\u3143', '\u3145',
-          '\u3146', '\u3147', '\u3148', '\u3149', '\u314a',
-          '\u314b', '\u314c', '\u314d', '\u314e']
-_MIDDLE = ['\u314f', '\u3150', '\u3151', '\u3152', '\u3153',    # 중성
-           '\u3154', '\u3155', '\u3156', '\u3157', '\u3158',
-           '\u3159', '\u315a', '\u315b', '\u315c', '\u315d',
-           '\u315e', '\u315f', '\u3160', '\u3161', '\u3162',
-           '\u3163']
-_LAST = ['\u3131', '\u3132', '\u3133', '\u3134', '\u3135',    # 종성
-         '\u3136', '\u3137', '\u3139', '\u313a', '\u313b',
-         '\u313c', '\u313d', '\u313e', '\u313f', '\u3140',
-         '\u3141', '\u3142', '\u3144', '\u3145', '\u3146',
-         '\u3147', '\u3148', '\u314a', '\u314b', '\u314c',
-         '\u314d', '\u314e']
+# Hangul compatibility jamo (initial and final share codes; typed with a 2-beolsik keyboard)
+_FIRST_COMPAT = ['\u3131', '\u3132', '\u3134', '\u3137', '\u3138',    # initial consonants
+                 '\u3139', '\u3141', '\u3142', '\u3143', '\u3145',
+                 '\u3146', '\u3147', '\u3148', '\u3149', '\u314a',
+                 '\u314b', '\u314c', '\u314d', '\u314e']
+_MIDDLE_COMPAT = ['\u314f', '\u3150', '\u3151', '\u3152', '\u3153',    # medial vowels
+                  '\u3154', '\u3155', '\u3156', '\u3157', '\u3158',
+                  '\u3159', '\u315a', '\u315b', '\u315c', '\u315d',
+                  '\u315e', '\u315f', '\u3160', '\u3161', '\u3162',
+                  '\u3163']
+_LAST_COMPAT = ['\u3131', '\u3132', '\u3133', '\u3134', '\u3135',    # final consonants
+                '\u3136', '\u3137', '\u3139', '\u313a', '\u313b',
+                '\u313c', '\u313d', '\u313e', '\u313f', '\u3140',
+                '\u3141', '\u3142', '\u3144', '\u3145', '\u3146',
+                '\u3147', '\u3148', '\u314a', '\u314b', '\u314c',
+                '\u314d', '\u314e']
+_ALL_COMPAT = _FIRST_COMPAT + _MIDDLE_COMPAT + _LAST_COMPAT    # order: first, middle, last
+
+# Hangul jamo (initial and final consonants have different codes; used by the Sejong corpus)
+_FIRST_JAMO = ['\u1100', '\u1101', '\u1102', '\u1103', '\u1104',    # initial consonants
+               '\u1105', '\u1106', '\u1107', '\u1108', '\u1109',
+               '\u110a', '\u110b', '\u110c', '\u110d', '\u110e',
+               '\u110f', '\u1110', '\u1111', '\u1112']
+_MIDDLE_JAMO = ['\u1161', '\u1162', '\u1163', '\u1164', '\u1165',    # medial vowels
+                '\u1166', '\u1167', '\u1168', '\u1169', '\u116a',
+                '\u116b', '\u116c', '\u116d', '\u116e', '\u116f',
+                '\u1170', '\u1171', '\u1172', '\u1173', '\u1174',
+                '\u1175']
+_LAST_JAMO = ['\u11a8', '\u11a9', '\u11aa', '\u11ab', '\u11ac',    # final consonants
+              '\u11ad', '\u11ae', '\u11af', '\u11b0', '\u11b1',
+              '\u11b2', '\u11b3', '\u11b4', '\u11b5', '\u11b6',
+              '\u11b7', '\u11b8', '\u11b9', '\u11ba', '\u11bb',
+              '\u11bc', '\u11bd', '\u11be', '\u11bf', '\u11c0',
+              '\u11c1', '\u11c2']
+_ALL_JAMO = _FIRST_JAMO + _MIDDLE_JAMO + _LAST_JAMO    # same order as _ALL_COMPAT
+_ALL_JAMO_SET = set(_ALL_JAMO)
+_JAMO_TO_COMPAT = dict(zip(_ALL_JAMO, _ALL_COMPAT))    # paired position by position
+
+# Halfwidth jamo (initial and final share codes like the compatibility area; half glyph width)
+_FIRST_HALFWIDTH = ['\uffa1', '\uffa2', '\uffa4', '\uffa7', '\uffa8',    # initial consonants
+                    '\uffa9', '\uffb1', '\uffb2', '\uffb3', '\uffb5',
+                    '\uffb6', '\uffb7', '\uffb8', '\uffb9', '\uffba',
+                    '\uffbb', '\uffbc', '\uffbd', '\uffbe']
+_MIDDLE_HALFWIDTH = ['\uffc2', '\uffc3', '\uffc4', '\uffc5', '\uffc6',    # medial vowels
+                     '\uffc7', '\uffca', '\uffcb', '\uffcc', '\uffcd',
+                     '\uffce', '\uffcf', '\uffd2', '\uffd3', '\uffd4',
+                     '\uffd5', '\uffd6', '\uffd7', '\uffda', '\uffdb',
+                     '\uffdc']
+_LAST_HALFWIDTH = ['\uffa1', '\uffa2', '\uffa3', '\uffa4', '\uffa5',    # final consonants
+                   '\uffa6', '\uffa7', '\uffa9', '\uffaa', '\uffab',
+                   '\uffac', '\uffad', '\uffae', '\uffaf', '\uffb0',
+                   '\uffb1', '\uffb2', '\uffb4', '\uffb5', '\uffb6',
+                   '\uffb7', '\uffb8', '\uffba', '\uffbb', '\uffbc',
+                   '\uffbd', '\uffbe']
+_ALL_HALFWIDTH = _FIRST_HALFWIDTH + _MIDDLE_HALFWIDTH + _LAST_HALFWIDTH    # as _ALL_COMPAT
+_ALL_HALFWIDTH_SET = set(_ALL_HALFWIDTH)
+_HALFWIDTH_TO_COMPAT = dict(zip(_ALL_HALFWIDTH, _ALL_COMPAT))    # paired position by position
 
 
 #############
@@ -54,12 +96,12 @@ def _decomp_char(char: str) -> Tuple[str, str, str]:
     first_idx = first_start // 21
     middle_idx = first_start % 21
 
-    first = _FIRST[first_idx]
-    middle = _MIDDLE[middle_idx]
+    first = _FIRST_COMPAT[first_idx]
+    middle = _MIDDLE_COMPAT[middle_idx]
     if not last_idx:
         return first, middle
 
-    last = _LAST[last_idx-1]
+    last = _LAST_COMPAT[last_idx-1]
     return first, middle, last
 
 
@@ -71,6 +113,9 @@ def decompose(text: str) -> str:
     Returns:
         자소 분해된 텍스트
     """
+    if not text:
+        return text
+
     decomposed = []
     for char in text:
         code = ord(char)
@@ -79,3 +124,25 @@ def decompose(text: str) -> str:
         else:
             decomposed.extend(_decomp_char(char))
     return ''.join(decomposed)
+
+
+def norm_compat(text: str) -> str:
+    """
+    Normalize Hangul jamo found in a text to the compatibility jamo area.
+    Args:
+        text:  Hangul text
+    Returns:
+        text whose jamo and halfwidth jamo are replaced by compatibility jamo
+    """
+    if not text:
+        # empty (or None) input is handed back untouched
+        return text
+
+    def _to_compat(char: str) -> str:
+        # jamo area first, then halfwidth area; any other character stays as it is
+        if char in _JAMO_TO_COMPAT:
+            return _JAMO_TO_COMPAT[char]
+        if char in _HALFWIDTH_TO_COMPAT:
+            return _HALFWIDTH_TO_COMPAT[char]
+        return char
+    return ''.join(_to_compat(char) for char in text)