diff --git a/rsc/Makefile b/rsc/Makefile index a99039b..36d1a3a 100644 --- a/rsc/Makefile +++ b/rsc/Makefile @@ -1,6 +1,6 @@ HOME_DIR = . BIN_DIR = $(HOME_DIR)/bin -LIB_DIR = $(HOME_DIR)/lib +SRC_PYTHON = $(HOME_DIR)/../src/main/python RSC_SRC = $(HOME_DIR)/src PREFIX = /usr/local RSC_DIR = $(PREFIX)/share/khaiii @@ -35,22 +35,22 @@ all: $(MODEL) $(PREANAL) $(RESTORE) $(ERRPATCH) $(wordlist 2,100,$(MODEL)): $(firstword $(MODEL)) $(firstword $(MODEL)): $(RSC_SRC)/$(MODEL_SIZE).config.json $(RSC_SRC)/$(MODEL_SIZE).model.pickle mkdir -p $(RSC_DIR) - PYTHONPATH=$(LIB_DIR) python3 $(BIN_DIR)/compile_model.py --model-size $(MODEL_SIZE) --rsc-src $(RSC_SRC) --rsc-dir $(RSC_DIR) + PYTHONPATH=$(SRC_PYTHON) python3 $(BIN_DIR)/compile_model.py --model-size $(MODEL_SIZE) --rsc-src $(RSC_SRC) --rsc-dir $(RSC_DIR) $(wordlist 2,100,$(PREANAL)): $(firstword $(PREANAL)) $(firstword $(PREANAL)): $(RSC_SRC)/preanal.auto $(RSC_SRC)/preanal.manual mkdir -p $(RSC_DIR) - PYTHONPATH=$(LIB_DIR) python3 $(BIN_DIR)/compile_preanal.py --rsc-src $(RSC_SRC) --rsc-dir $(RSC_DIR) + PYTHONPATH=$(SRC_PYTHON) python3 $(BIN_DIR)/compile_preanal.py --rsc-src $(RSC_SRC) --rsc-dir $(RSC_DIR) $(wordlist 2,100,$(RESTORE)): $(firstword $(RESTORE)) $(firstword $(RESTORE)): $(RSC_SRC)/restore.dic $(RSC_SRC)/vocab.out $(RSC_SRC)/vocab.out.more mkdir -p $(RSC_DIR) - PYTHONPATH=$(LIB_DIR) python3 $(BIN_DIR)/compile_restore.py --rsc-src $(RSC_SRC) --rsc-dir $(RSC_DIR) + PYTHONPATH=$(SRC_PYTHON) python3 $(BIN_DIR)/compile_restore.py --rsc-src $(RSC_SRC) --rsc-dir $(RSC_DIR) $(wordlist 2,100,$(ERRPATCH)): $(firstword $(ERRPATCH)) $(firstword $(ERRPATCH)): $(RSC_SRC)/$(MODEL_SIZE).errpatch.auto $(RSC_SRC)/$(MODEL_SIZE).errpatch.manual mkdir -p $(RSC_DIR) - PYTHONPATH=$(LIB_DIR) python3 $(BIN_DIR)/compile_errpatch.py --model-size $(MODEL_SIZE) --rsc-src $(RSC_SRC) --rsc-dir $(RSC_DIR) + PYTHONPATH=$(SRC_PYTHON) python3 $(BIN_DIR)/compile_errpatch.py --model-size $(MODEL_SIZE) --rsc-src $(RSC_SRC) --rsc-dir $(RSC_DIR) clean: rm -rf $(RSC_DIR) diff --git a/rsc/bin/compile_errpatch.py b/rsc/bin/compile_errpatch.py index 34e4486..caf7d8a 100755 --- a/rsc/bin/compile_errpatch.py +++ b/rsc/bin/compile_errpatch.py @@ -5,14 +5,14 @@ """ 오분석 패치를 빌드하는 스크립트 __author__ = 'Jamie (jamie.lim@kakaocorp.com)' -__copyright__ = 'Copyright (C) 2018-, Kakao Corp. All rights reserved.' +__copyright__ = 'Copyright (C) 2019-, Kakao Corp. All rights reserved.' """ ########### # imports # ########### -import argparse +from argparse import ArgumentParser, Namespace from collections import defaultdict import glob import itertools @@ -20,14 +20,16 @@ import os import struct import sys +from typing import Dict, List, Tuple + +from khaiii.munjong import sejong_corpus +from khaiii.resource.char_align import Aligner, AlignError +from khaiii.resource.morphs import Morph, ParseError +from khaiii.resource.morphs import WORD_DELIM_STR, SENT_DELIM_STR, WORD_DELIM_NUM, SENT_DELIM_NUM +from khaiii.resource.trie import Trie -from char_align import Aligner, AlignError from compile_preanal import align_to_tag, print_errors from compile_restore import load_restore_dic, load_vocab_out -from morphs import Morph, ParseError -from morphs import WORD_DELIM_STR, SENT_DELIM_STR, WORD_DELIM_NUM, SENT_DELIM_NUM -import sejong_corpus -from trie import Trie ######### @@ -37,7 +39,7 @@ class Entry: """ error patch entry """ - def __init__(self, file_path, line_num, line): + def __init__(self, file_path: str, line_num: int, line: str): """ Args: file_path: 파일 경로 @@ -65,7 +67,7 @@ def __str__(self): return '{}: "{}"'.format(file_num, line) return '{}\t{}\t{}'.format(self.raw, Morph.to_str(self.left), Morph.to_str(self.right)) - def key_str(self): + def key_str(self) -> str: """ 패치의 중복 검사를 하기 위해 원문과 left를 이용하여 키를 생성 Returns: @@ -102,7 +104,7 @@ def _parse(self): ############# # functions # ############# -def _split_list(lst, delim): +def _split_list(lst: List[str], delim: str) -> List[List[str]]: """ 리스트를 delimiter로 split하는 함수 @@ -125,7 +127,8 @@ def _split_list(lst, delim): return sublists -def align_patch(rsc_src, raw, morph_str): +def align_patch(rsc_src: Tuple[Aligner, Dict, Dict[str, int]], raw: str, morph_str: str) \ + -> List[int]: """ 패치의 원문과 분석 결과를 음절단위 매핑(정렬)을 수행한다. Args: @@ -171,7 +174,7 @@ def align_patch(rsc_src, raw, morph_str): return tag_nums -def mix_char_tag(chars, tags): +def mix_char_tag(chars: str, tags: List[int]) -> List[int]: """ 음절과 출력 태그를 비트 연산으로 합쳐서 하나의 (32비트) 숫자로 표현한다. Args: @@ -195,11 +198,11 @@ def mix_char_tag(chars, tags): return char_nums -def _load_entries(args): +def _load_entries(args: Namespace) -> List[Entry]: """ 패치 엔트리를 파일로부터 로드한다. Args: - args: arguments + args: program arguments Returns: 엔트리 리스트 """ @@ -221,7 +224,7 @@ def _load_entries(args): return good_entries -def _check_dup(entries): +def _check_dup(entries: List[Entry]): """ 중복된 엔트리가 없는 지 확인한다. Args: @@ -239,7 +242,7 @@ def _check_dup(entries): print_errors(bad_entries) -def _set_align(rsc_src, entries): # pylint: disable=invalid-name +def _set_align(rsc_src: Tuple[Aligner, dict, Dict[str, int]], entries: List[Entry]): """ 음절과 형태소 분석 결과를 정렬한다. Args: @@ -265,7 +268,7 @@ def _set_align(rsc_src, entries): # pylint: disable=invalid-name print_errors(bad_entries) -def _save_trie(rsc_dir, entries): +def _save_trie(rsc_dir: str, entries: List[Entry]): """ 트라이를 저장한다. Args: @@ -309,7 +312,7 @@ def _save_trie(rsc_dir, entries): (sum([len(r) for r in rights])+1) * struct.Struct('h').size) -def run(args): +def run(args: Namespace): """ run function which is the start point of program Args: @@ -338,7 +341,7 @@ def main(): """ main function processes only argument parsing """ - parser = argparse.ArgumentParser(description='기분석 사전을 빌드하는 스크립트') + parser = ArgumentParser(description='기분석 사전을 빌드하는 스크립트') parser.add_argument('--model-size', help='model size ', metavar='SIZE', default='base') parser.add_argument('--rsc-src', help='source directory (text) ', diff --git a/rsc/bin/compile_model.py b/rsc/bin/compile_model.py index fa7b122..1c10d1f 100755 --- a/rsc/bin/compile_model.py +++ b/rsc/bin/compile_model.py @@ -5,15 +5,14 @@ """ compile trained model for C/C++ decoder __author__ = 'Jamie (jamie.lim@kakaocorp.com)' -__copyright__ = 'Copyright (C) 2018-, Kakao Corp. All rights reserved.' +__copyright__ = 'Copyright (C) 2019-, Kakao Corp. All rights reserved.' """ ########### # imports # ########### -import argparse -from argparse import Namespace +from argparse import ArgumentParser, Namespace import json import logging import os @@ -21,7 +20,7 @@ import pickle from typing import Tuple -from resource import Resource # pylint: disable=wrong-import-order +from khaiii.resource.resource import Resource ############# @@ -40,7 +39,7 @@ def load_cfg_rsc(rsc_src: str, model_size: str) -> Tuple[Namespace, Resource]: file_path = '{}/{}.config.json'.format(rsc_src, model_size) cfg_dic = json.load(open(file_path, 'r', encoding='UTF-8')) logging.info('config: %s', json.dumps(cfg_dic, indent=2)) - cfg = argparse.Namespace() + cfg = Namespace() for key, val in cfg_dic.items(): setattr(cfg, key, val) cwd = os.path.realpath(os.getcwd()) @@ -159,7 +158,7 @@ def main(): """ main function processes only argument parsing """ - parser = argparse.ArgumentParser(description='part-of-speech tagger') + parser = ArgumentParser(description='part-of-speech tagger') parser.add_argument('--model-size', help='model size ', metavar='SIZE', default='base') parser.add_argument('--rsc-src', help='source directory (model) ', diff --git a/rsc/bin/compile_preanal.py b/rsc/bin/compile_preanal.py index 556f5ea..6874d2c 100755 --- a/rsc/bin/compile_preanal.py +++ b/rsc/bin/compile_preanal.py @@ -5,36 +5,38 @@ """ 기분석 사전을 빌드하는 스크립트 __author__ = 'Jamie (jamie.lim@kakaocorp.com)' -__copyright__ = 'Copyright (C) 2018-, Kakao Corp. All rights reserved.' +__copyright__ = 'Copyright (C) 2019-, Kakao Corp. All rights reserved.' """ ########### # imports # ########### -import argparse +from argparse import ArgumentParser, Namespace from collections import defaultdict import glob import logging import os import struct import sys +from typing import Dict, List, Tuple + +from khaiii.munjong import sejong_corpus +from khaiii.resource.char_align import Aligner, AlignError, MrpChr +from khaiii.resource.morphs import Morph, ParseError +from khaiii.resource.trie import Trie from compile_restore import load_restore_dic, load_vocab_out, append_new_entries -from char_align import Aligner, AlignError, MrpChr -from morphs import Morph, ParseError -import sejong_corpus -from trie import Trie ######### # types # ######### -class Entry(object): +class Entry: """ pre-analyzed dictionary entry """ - def __init__(self, file_path, line_num, line): + def __init__(self, file_path: str, line_num: int, line: str): """ Args: file_path: 파일 경로 @@ -59,7 +61,7 @@ def __str__(self): line = '# {}'.format(self.line) if self.is_sharp else self.line if self.err_msg: return '{}{}: "{}"'.format(file_num, self.err_msg, line) - elif self.is_sharp: + if self.is_sharp: return '{}: "{}"'.format(file_num, line) return '{}{}\t{}'.format(self.word, '*' if self.is_pfx else '', Morph.to_str(self.morphs)) @@ -94,7 +96,7 @@ def _parse(self): ############# # functions # ############# -def print_errors(entries): +def print_errors(entries: List[Entry]): """ 에러가 발생한 엔트리를 출력하고 프로그램을 종료한다. Args: @@ -107,7 +109,7 @@ def print_errors(entries): sys.exit(1) -def _load_entries(args): +def _load_entries(args: Namespace) -> List[Entry]: """ 사전 엔트리를 파일로부터 로드한다. Args: @@ -133,7 +135,7 @@ def _load_entries(args): return good_entries -def _check_dup(entries): +def _check_dup(entries: List[Entry]): """ 중복된 엔트리가 없는 지 확인한다. Args: @@ -151,7 +153,7 @@ def _check_dup(entries): print_errors(bad_entries) -def _set_align(aligner, Word, entries): # pylint: disable=invalid-name +def _set_align(aligner: Aligner, Word: type, entries: List[Entry]): # pylint: disable=invalid-name """ 음절과 형태소 분석 결과를 정렬한다. Args: @@ -173,7 +175,9 @@ def _set_align(aligner, Word, entries): # pylint: disable=invalid-name print_errors(bad_entries) -def align_to_tag(raw_word, alignment, restore, vocab): +def align_to_tag(raw_word: str, alignment: List[List[str]], restore: Tuple[dict, dict], + vocab: Tuple[Dict[str, int], Dict[str, int]]) \ + -> Tuple[List[str], List[int]]: """ 어절의 원문과 정렬 정보를 활용해 음절과 매핑된 태그를 생성한다. Args: @@ -224,7 +228,8 @@ def align_to_tag(raw_word, alignment, restore, vocab): return tag_outs, tag_nums -def _set_tag_out(restore_dic, restore_new, vocab_out, vocab_new, entries): +def _set_tag_out(restore_dic: dict, restore_new: dict, vocab_out: Dict[str, int], + vocab_new: Dict[str, int], entries: List[Entry]): """ 음절 정렬로부터 출력 태그를 결정하고 출력 태그의 번호를 매핑한다. Args: @@ -240,7 +245,7 @@ def _set_tag_out(restore_dic, restore_new, vocab_out, vocab_new, entries): (vocab_out, vocab_new)) -def _save_trie(rsc_dir, entries): +def _save_trie(rsc_dir: str, entries: List[Entry]): """ 트라이를 저장한다. Args: @@ -270,7 +275,7 @@ def _save_trie(rsc_dir, entries): (sum([len(e.tag_nums) for e in entries])+1) * struct.Struct('H').size) -def run(args): +def run(args: Namespace): """ run function which is the start point of program Args: @@ -301,7 +306,7 @@ def main(): """ main function processes only argument parsing """ - parser = argparse.ArgumentParser(description='기분석 사전을 빌드하는 스크립트') + parser = ArgumentParser(description='기분석 사전을 빌드하는 스크립트') parser.add_argument('--rsc-src', help='source directory (text) ', metavar='DIR', default='./src') parser.add_argument('--rsc-dir', help='target directory (binary) ', diff --git a/rsc/bin/compile_restore.py b/rsc/bin/compile_restore.py index f602096..132df3a 100755 --- a/rsc/bin/compile_restore.py +++ b/rsc/bin/compile_restore.py @@ -5,23 +5,22 @@ """ 원형복원 사전을 빌드하는 스크립트 __author__ = 'Jamie (jamie.lim@kakaocorp.com)' -__copyright__ = 'Copyright (C) 2018-, Kakao Corp. All rights reserved.' +__copyright__ = 'Copyright (C) 2019-, Kakao Corp. All rights reserved.' """ ########### # imports # ########### -from __future__ import print_function - -import argparse +from argparse import ArgumentParser, Namespace from collections import defaultdict import logging import os import struct import sys +from typing import Dict, Tuple -from morphs import TAG_SET +from khaiii.resource.morphs import TAG_SET ############# @@ -33,7 +32,7 @@ ############# # functions # ############# -def load_restore_dic(file_path): +def load_restore_dic(file_path: str) -> Dict[Tuple[str, str], Dict[int, str]]: """ 원형복원 사전을 로드한다. Args: @@ -61,7 +60,7 @@ def load_restore_dic(file_path): return restore_dic -def load_vocab_out(rsc_src): +def load_vocab_out(rsc_src: str) -> Dict[str, int]: """ 출력 태그 vocabulary를 로드한다. Args: @@ -80,7 +79,7 @@ def load_vocab_out(rsc_src): return {tag: idx for idx, tag in enumerate(vocab_out + vocab_out_more, start=1)} -def append_new_entries(rsc_src, restore_new, vocab_new): +def append_new_entries(rsc_src: str, restore_new: dict, vocab_new: Dict[str, int]): """ 기분석 사전 빌드 중에 새로 추가가 필요한 사전 엔트리를 해당 사전에 추가한다. Args: @@ -103,7 +102,7 @@ def append_new_entries(rsc_src, restore_new, vocab_new): print(tag, file=fout) -def _make_bin(restore_dic, vocab_out, vocab_new): +def _make_bin(restore_dic: dict, vocab_out: Dict[str, int], vocab_new: Dict[str, int]) -> dict: """ 두 텍스트 사전을 읽어들여 바이너리 형태의 key-value 사전을 만든다. Args: @@ -140,7 +139,7 @@ def _make_bin(restore_dic, vocab_out, vocab_new): return bin_dic -def _save_restore_dic(rsc_dir, bin_dic): +def _save_restore_dic(rsc_dir: str, bin_dic: dict): """ 원형복원 바이너리 사전을 저장한다. Args: @@ -158,14 +157,13 @@ def _save_restore_dic(rsc_dir, bin_dic): logging.info('restore.val: %d', 4 * sum([len(vals) for vals in bin_dic.values()])) -def _save_restore_one(rsc_dir, vocab_out, vocab_new): +def _save_restore_one(rsc_dir: str, vocab_out: Dict[str, int], vocab_new: Dict[str, int]): """ 출력 태그 번호 별 원형복원을 하지 않는 비복원 사전을 저장한다. Args: rsc_dir: resource directory vocab_out: 출력 태그 사전 vocab_new: 출력 태그 사전에 추가할 새로운 태그 - :return: """ idx_tags = sorted([(idx, tag) for tag, idx in list(vocab_out.items()) + list(vocab_new.items())]) @@ -182,7 +180,7 @@ def _save_restore_one(rsc_dir, vocab_out, vocab_new): logging.info('restore.one: %d', 1 + len(idx_tags)) -def run(args): +def run(args: Namespace): """ run function which is the start point of program Args: @@ -210,7 +208,7 @@ def main(): """ main function processes only argument parsing """ - parser = argparse.ArgumentParser(description='기분석 사전을 빌드하는 스크립트') + parser = ArgumentParser(description='기분석 사전을 빌드하는 스크립트') parser.add_argument('--rsc-src', help='source directory (text) ', metavar='DIR', default='./src') parser.add_argument('--rsc-dir', help='target directory (binary) ', diff --git a/src/main/python/khaiii/__init__.py b/src/main/python/khaiii/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/main/python/khaiii/khaiii.py b/src/main/python/khaiii/khaiii.py index 5f174e2..4bdb193 100644 --- a/src/main/python/khaiii/khaiii.py +++ b/src/main/python/khaiii/khaiii.py @@ -12,8 +12,7 @@ ########### # imports # ########### -import argparse -from argparse import Namespace +from argparse import ArgumentParser, Namespace import ctypes from ctypes.util import find_library import logging @@ -26,11 +25,10 @@ ######### # types # ######### -class _khaiii_morph_t(ctypes.Structure): # pylint: disable=invalid-name +class _khaiii_morph_t(ctypes.Structure): # pylint: disable=invalid-name,too-few-public-methods """ khaiii_morph_t structure """ - pass _khaiii_morph_t._fields_ = [ # pylint: disable=protected-access @@ -43,11 +41,10 @@ class _khaiii_morph_t(ctypes.Structure): # pylint: disable=invalid-name ] -class _khaiii_word_t(ctypes.Structure): # pylint: disable=invalid-name +class _khaiii_word_t(ctypes.Structure): # pylint: disable=invalid-name,too-few-public-methods """ khaiii_word_t structure """ - pass _khaiii_word_t._fields_ = [ # pylint: disable=protected-access @@ -63,7 +60,6 @@ class KhaiiiExcept(Exception): """ khaiii API를 위한 표준 예외 클래스 """ - pass class KhaiiiMorph: @@ -384,7 +380,7 @@ def main(): """ main function processes only argument parsing """ - parser = argparse.ArgumentParser(description='khaiii API module test program') + parser = ArgumentParser(description='khaiii API module test program') parser.add_argument('--lib-path', help='library path', metavar='FILE', default='') parser.add_argument('--rsc-dir', help='resource directory', metavar='DIR', default='') parser.add_argument('--opt-str', help='option string (JSON format)', metavar='JSON', default='') diff --git a/src/main/python/khaiii/munjong/__init__.py b/src/main/python/khaiii/munjong/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/rsc/lib/sejong_corpus.py b/src/main/python/khaiii/munjong/sejong_corpus.py similarity index 76% rename from rsc/lib/sejong_corpus.py rename to src/main/python/khaiii/munjong/sejong_corpus.py index c8b9681..fb19fe3 100755 --- a/rsc/lib/sejong_corpus.py +++ b/src/main/python/khaiii/munjong/sejong_corpus.py @@ -5,19 +5,19 @@ """ Sejong corpus parser __author__ = 'Jamie (jamie.lim@kakaocorp.com)' -__copyright__ = 'Copyright (C) 2018-, Kakao Corp. All rights reserved.' +__copyright__ = 'Copyright (C) 2019-, Kakao Corp. All rights reserved.' """ ########### # imports # ########### -import argparse -import codecs +from argparse import ArgumentParser import logging import os import re import sys +from typing import Iterator, TextIO from unicodedata import normalize as norm @@ -45,7 +45,7 @@ _OPEN_TAGS_IN_WRITTEN_SENT = {'', } # open tags in sentence at written corpus # close tags in sentence at written corpus -_CLOSE_TAGS_IN_WRITTEN_SENT = set([tag[0] + '/' + tag[1:] for tag in _OPEN_TAGS_IN_WRITTEN_SENT]) +_CLOSE_TAGS_IN_WRITTEN_SENT = {tag[0] + '/' + tag[1:] for tag in _OPEN_TAGS_IN_WRITTEN_SENT} # open and close tags in sentence at written corpus _TAGS_IN_WRITTEN_SENT = _OPEN_TAGS_IN_WRITTEN_SENT | _CLOSE_TAGS_IN_WRITTEN_SENT @@ -58,84 +58,67 @@ class ParseError(Exception): """ error occurred while parsing corpus """ - pass -class Sentence(object): +class Morph: # pylint: disable=too-few-public-methods """ - sentence + morpheme """ - def __init__(self): - self.words = [] # word list - self._wids = set() # word ID set + def __init__(self, lex: str = '', tag: str = ''): + if ' ' in lex: + raise ParseError('space in raw morph: %s' % lex) + self.lex = lex # lexical form + self.tag = tag # part-of-speech tag def __str__(self): - words_str = '\n'.join([str(_) for _ in self.words]) - return '# %s\n%s' % (self.raw_str(), words_str) - - def raw_str(self): - """ - raw sentence (words) - :return: raw sentence - """ - return ' '.join([_.raw for _ in self.words]) - - def morph_str(self): - """ - make morpheme string - :return: morpheme string - """ - return ' + '.join([_.morph_str() for _ in self.words]) + return '{}/{}'.format(self.lex, self.tag) - @classmethod - def is_opening(cls, line): - """ - whether sentence opening or not - :param line: line - :return: whether opening or not - """ - return line in SENT_OPEN_TAGS + def __eq__(self, other: 'Morph'): + return self.lex == other.lex and self.tag == other.tag - @classmethod - def is_closing(cls, line): - """ - whether sentence closing or not - :param line: line - :return: whether closing or not - """ - return line in SENT_CLOSE_TAGS + def __hash__(self): + return hash(str(self)) @classmethod - def is_tag_in_sent(cls, line): - """ - whether tag in sentence or not - :param line: line - :return: whether tag or not + def parse(cls, token_str: str, file_name: str = '', line_num: int = 0): """ - return line in _TAGS_IN_WRITTEN_SENT - - def is_good_tags(self): - """ - whether all tags in sentence are good(correct) or not - :return: whether all tags are good + parse token string + Args: + token_str: morpheme/tag string + file_name: file name + line_num: line number + Returns: + Morph object """ - return not [morph.tag for word in self.words for morph in word.morphs - if morph.tag not in TAG_SET] + def _raise(err_msg: str, file_name: str = None, line_num: int = 0): + """ + 에러 메세지를 이용해 파싱 에러를 발생시킨다. + Arguments: + err_msg: 에러 메세지 + file_name: 파일명 + line_num: 라인 번호 + """ + file_pfx = '{}({}) '.format(file_name, line_num) if file_name and line_num else '' + raise ParseError('{}{}'.format(file_pfx, err_msg)) - def append(self, word): - """ - append word - :param word: Word object - """ - if word.wid and word.wid in self._wids: - raise ParseError('duplicated word ID: %s' % word) - self.words.append(word) - self._wids.add(word.wid) + morph = Morph() + morph.lex, morph.tag = token_str.rsplit('/', 1) + if not morph.lex: + _raise('no text in morpheme: {}'.format(token_str), file_name, line_num) + if ' ' in morph.lex: + _raise('space in raw morph: {}'.format(token_str), file_name, line_num) + if morph.tag not in TAG_SET: + _raise('invalid tag: {} in {}'.format(morph.tag, token_str), file_name, line_num) + for char in morph.lex: + if 0x1100 <= ord(char) < 0x1200: + _raise('Hangul Jamo character: "{}" in {}'.format(char, token_str), file_name, + line_num) + return morph -class Word(object): +class Word: """ - word(EoJeol) + word(어절) """ def __init__(self): self.wid = '' # word ID @@ -143,10 +126,9 @@ def __init__(self): self.morphs = [] # morpheme list def __str__(self): - return '%s\t%s\t%s' % (self.wid, self.raw, - ' + '.join([str(morph) for morph in self.morphs])) + return '{}\t{}\t{}'.format(self.wid, self.raw, ' + '.join([str(m) for m in self.morphs])) - def __eq__(self, other): + def __eq__(self, other: 'Word'): if self.raw != other.raw: return False if len(self.morphs) != len(other.morphs): @@ -156,20 +138,24 @@ def __eq__(self, other): return False return True - def morph_str(self): + def morph_str(self) -> str: """ make morpheme string - :return: morpheme string + Returns: + morpheme string """ return ' + '.join([str(_) for _ in self.morphs]) @classmethod - def parse(cls, line, file_name, line_num): + def parse(cls, line: str, file_name: str, line_num: int) -> 'Word': """ parse word(EoJeol) with single line - :param line: line - :param file_name: file name - :param line_num: line number + Args: + line: line + file_name: file name + line_num: line number + Returns: + Word object """ cols = line.split('\t') if len(cols) != 3: @@ -192,68 +178,98 @@ def parse(cls, line, file_name, line_num): return word -class Morph(object): # pylint: disable=too-few-public-methods +class Sentence: """ - morpheme + sentence """ - def __init__(self, lex='', tag=''): - if ' ' in lex: - raise ParseError('space in raw morph: %s' % lex) - self.lex = lex # lexical form - self.tag = tag # part-of-speech tag + def __init__(self): + self.words = [] # word list + self._wids = set() # word ID set def __str__(self): - return '%s/%s' % (self.lex, self.tag) + words_str = '\n'.join([str(_) for _ in self.words]) + return '# %s\n%s' % (self.raw_str(), words_str) - def __eq__(self, other): - return self.lex == other.lex and self.tag == other.tag + def raw_str(self) -> str: + """ + raw sentence (words) + Returns: + raw sentence + """ + return ' '.join([_.raw for _ in self.words]) - def __hash__(self): - return hash(str(self)) + def morph_str(self) -> str: + """ + make morpheme string + Returns: + morpheme string + """ + return ' + '.join([_.morph_str() for _ in self.words]) @classmethod - def parse(cls, token_str, file_name='', line_num=0): + def is_opening(cls, line: str) -> bool: """ - parse token string - :param token_str: morpheme/tag string - :param file_name: file name - :param line_num: line number - :return: Morph object + whether sentence opening or not + Args: + line: line + Returns: + whether opening or not """ - def _raise(err_msg, file_name=None, line_num=0): - """ - 에러 메세지를 이용해 파싱 에러를 발생시킨다. - Arguments: - err_msg: 에러 메세지 - file_name: 파일명 - line_num: 라인 번호 - """ - file_pfx = '{}({}) '.format(file_name, line_num) if file_name and line_num else '' - raise ParseError('{}{}'.format(file_pfx, err_msg)) + return line in SENT_OPEN_TAGS - morph = Morph() - morph.lex, morph.tag = token_str.rsplit('/', 1) - if not morph.lex: - _raise('no text in morpheme: {}'.format(token_str), file_name, line_num) - if ' ' in morph.lex: - _raise('space in raw morph: {}'.format(token_str), file_name, line_num) - if morph.tag not in TAG_SET: - _raise('invalid tag: {} in {}'.format(morph.tag, token_str), file_name, line_num) - for char in morph.lex: - if 0x1100 <= ord(char) < 0x1200: - _raise('Hangul Jamo character: "{}" in {}'.format(char, token_str), file_name, - line_num) - return morph + @classmethod + def is_closing(cls, line: str) -> bool: + """ + whether sentence closing or not + Args: + line: line + Returns: + whether closing or not + """ + return line in SENT_CLOSE_TAGS + + @classmethod + def is_tag_in_sent(cls, line: str) -> bool: + """ + whether tag in sentence or not + Args: + line: line + Returns: + whether tag or not + """ + return line in _TAGS_IN_WRITTEN_SENT + + def is_good_tags(self) -> bool: + """ + whether all tags in sentence are good(correct) or not + Returns: + whether all tags are good + """ + return not [morph.tag for word in self.words for morph in word.morphs + if morph.tag not in TAG_SET] + + def append(self, word: Word): + """ + append word + Args: + word: Word object + """ + if word.wid and word.wid in self._wids: + raise ParseError('duplicated word ID: %s' % word) + self.words.append(word) + self._wids.add(word.wid) ############# # functions # ############# -def sents(fin): +def sents(fin: TextIO) -> Iterator[Sentence]: """ load from file and return sentences (generator) - :param fin: input file object - :yeild: Sentence object + Args: + fin: input file object + Yields: + Sentence object """ file_name = os.path.basename(fin.name) par_errs = [] @@ -292,14 +308,10 @@ def sents(fin): def run(): """ - load Sejong corpus and print + run function which is the start point of program """ - try: - for sent in sents(sys.stdin): - print(sent) - except ParseError as par_err: - logging.error(par_err) - sys.exit(1) + for sent in sents(sys.stdin): + print(sent) ######## @@ -307,18 +319,18 @@ def run(): ######## def main(): """ - load Sejong corpus and print + main function processes only argument parsing """ - parser = argparse.ArgumentParser(description='load Sejong corpus and print') + parser = ArgumentParser(description='load Sejong corpus and print') parser.add_argument('--input', help='input file ', metavar='FILE') parser.add_argument('--output', help='output file ', metavar='FILE') parser.add_argument('--debug', help='enable debug', action='store_true') args = parser.parse_args() if args.input: - sys.stdin = codecs.open(args.input, 'r', encoding='UTF-8') + sys.stdin = open(args.input, 'r', encoding='UTF-8') if args.output: - sys.stdout = codecs.open(args.output, 'w', encoding='UTF-8') + sys.stdout = open(args.output, 'w', encoding='UTF-8') if args.debug: logging.basicConfig(level=logging.DEBUG) else: diff --git a/src/main/python/khaiii/resource/__init__.py b/src/main/python/khaiii/resource/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/rsc/lib/char_align.py b/src/main/python/khaiii/resource/char_align.py similarity index 90% rename from rsc/lib/char_align.py rename to src/main/python/khaiii/resource/char_align.py index 65af965..1682606 100644 --- a/rsc/lib/char_align.py +++ b/src/main/python/khaiii/resource/char_align.py @@ -4,29 +4,71 @@ """ 형태소 분석 결과와 원문의 음절을 정렬하는 모듈 __author__ = 'Jamie (jamie.lim@kakaocorp.com)' -__copyright__ = 'Copyright (C) 2018-, Kakao Corp. All rights reserved.' +__copyright__ = 'Copyright (C) 2019-, Kakao Corp. All rights reserved.' """ ########### # imports # ########### -import codecs from collections import Counter, defaultdict import logging import os +from typing import List, Tuple -import jaso +from khaiii.munjong.sejong_corpus import Word +from khaiii.resource import jaso +from khaiii.resource.morphs import Morph ######### # types # ######### -class Aligner(object): +class MrpChr: # pylint: disable=too-few-public-methods + """ + 음절과 태그 pair + """ + def __init__(self, char: str, tag: str): + """ + Args: + char: 음절 + tag: 태그 + """ + self.char = char + self.tag = tag + + def __str__(self): + return '{}/{}'.format(self.char, self.tag) + + def __hash__(self): + return hash(str(self)) + + def __eq__(self, other: 'MrpChr'): + """ + Args: + other: 다른 객체 + Returns: + 같을 경우 True + """ + return self.char == other.char and self.tag == other.tag + + @classmethod + def to_str(cls, mrp_chrs: List['MrpChr']): + """ + MrpChr 객체 리스트를 문자열로 변환하는 메소드 + Args: + mrp_chrs: MrpChr 객체 리스트 + Returns: + 변환된 문자열 + """ + return ' '.join([str(m) for m in mrp_chrs]) + + +class Aligner: """ 음절과 형태소 분석 결과의 정렬을 수행하는 클래스 """ - def __init__(self, rsc_src): + def __init__(self, rsc_src: str): """ 리소스를 오픈하고 초기화한다. Args: @@ -37,7 +79,7 @@ def __init__(self, rsc_src): self.middle_unmapped = defaultdict(Counter) self._open(rsc_src) - def align(self, word): + def align(self, word: Word) -> List[List[MrpChr]]: """ 어절의 원문과 분석 결과를 음절 단위로 정렬(매핑)한다. Args: @@ -78,7 +120,7 @@ def print_middle_cnt(self): logging.info('total number of unmapped pairs: %d', sum([sum(cnt.values()) for cnt in self.middle_unmapped.values()])) - def _open(self, rsc_dir): + def _open(self, rsc_dir: str): """ initialize resources Args: @@ -86,7 +128,7 @@ def _open(self, rsc_dir): """ file_path = '{}/char_align.map'.format(rsc_dir) file_name = os.path.basename(file_path) - for line_num, line in enumerate(codecs.open(file_path, 'r', encoding='UTF-8'), start=1): + for line_num, line in enumerate(open(file_path, 'r', encoding='UTF-8'), start=1): line = line.rstrip('\r\n') if not line or line[0] == '#': continue @@ -105,7 +147,7 @@ def _open(self, rsc_dir): self.align_map[key] = map_nums @classmethod - def _get_morph_raw(cls, word): + def _get_morph_raw(cls, word: Word) -> str: """ get raw string from morphemes Args: @@ -116,7 +158,7 @@ def _get_morph_raw(cls, word): return ''.join([m.lex for m in word.morphs]) @classmethod - def _norm(cls, text): + def _norm(cls, text: str) -> str: """ unicode normalization of text Args: @@ -127,7 +169,7 @@ def _norm(cls, text): return jaso.decompose(text) @classmethod - def _align_phoneme(cls, raw_word, mrp_chrs): + def _align_phoneme(cls, raw_word: str, mrp_chrs: List[MrpChr]) -> List[List[MrpChr]]: """ align word with morpheme which is same phoneme Args: @@ -163,7 +205,7 @@ def _align_phoneme(cls, raw_word, mrp_chrs): return maps @classmethod - def _align_forward(cls, raw_word, mrp_chrs): + def _align_forward(cls, raw_word: str, mrp_chrs: List[MrpChr]) -> Tuple[int, int]: """ align from front of word Args: @@ -192,7 +234,7 @@ def _align_forward(cls, raw_word, mrp_chrs): return word_idx, mrp_chrs_idx @classmethod - def _align_backward(cls, raw_word, mrp_chrs): + def _align_backward(cls, raw_word: str, mrp_chrs: List[MrpChr]) -> Tuple[int, int]: """ align from back of word Args: @@ -226,7 +268,7 @@ def _align_backward(cls, raw_word, mrp_chrs): return word_idx+1, mrp_chrs_idx+1 @classmethod - def _is_verb_ending(cls, verb, ending): + def _is_verb_ending(cls, verb: Morph, ending: Morph) -> bool: """ whether is verb + ending pattern or not Args: @@ -241,7 +283,7 @@ def _is_verb_ending(cls, verb, ending): ending_tag in {'EC', 'EP', 'EF', 'ETN', 'ETM'} @classmethod - def _are_first_last_phoneme_same(cls, raw_word, mrp_chrs): + def _are_first_last_phoneme_same(cls, raw_word: str, mrp_chrs: List[MrpChr]) -> bool: """ whether are same the first phoneme and last phoneme Args: @@ -255,7 +297,7 @@ def _are_first_last_phoneme_same(cls, raw_word, mrp_chrs): return word_norm[0] == morph_norm[0] and word_norm[-1] == morph_norm[-1] @classmethod - def _is_ah_ending_verb(cls, mrp_chr): + def _is_ah_ending_verb(cls, mrp_chr: MrpChr) -> bool: """ whether 'ㅏ' ending verb or not Args: @@ -269,7 +311,7 @@ def _is_ah_ending_verb(cls, mrp_chr): return len(norm_char) == 2 and norm_char[1] == 'ᅡ' # code is 4449, not 12623 @classmethod - def _is_eo_ending_verb(cls, mrp_chr): + def _is_eo_ending_verb(cls, mrp_chr: MrpChr) -> bool: """ whether 'ㅓ', 'ㅐ' ending verb or not Args: @@ -284,7 +326,8 @@ def _is_eo_ending_verb(cls, mrp_chr): norm_char[1] in ['ᅥ', 'ᅧ', 'ᅢ']) # code is 4453, 4455, 4450 @classmethod - def _align_middle_zero2one(cls, pfx_word, pfx_map, mdl_mrp_chrs, sfx_word, sfx_map): + def _align_middle_zero2one(cls, pfx_word: Word, pfx_map: List[MrpChr], + mdl_mrp_chrs: List[MrpChr], sfx_word: Word, sfx_map: List[MrpChr]): """ align middle chunks after forward/backward aligning which has no middle raw character, but has a remaining middle single morpheme character @@ -322,7 +365,7 @@ def _attach_to_pfx(): raise RuntimeError('nowhere attach to') @classmethod - def _is_share_phoneme(cls, mdl_word, mdl_mrp_chrs): + def _is_share_phoneme(cls, mdl_word: str, mdl_mrp_chrs: List[MrpChr]) -> bool: """ whether middle word characters and morpheme characters share same phoneme Args: @@ -342,7 +385,7 @@ def _is_share_phoneme(cls, mdl_word, mdl_mrp_chrs): return False return True - def _align_middle_by_dic(self, mdl_word, mdl_mrp_chrs): + def _align_middle_by_dic(self, mdl_word: str, mdl_mrp_chrs: List[MrpChr]) -> List[List[MrpChr]]: """ align middle chunks after forward/backward aligning with mapping dictionary Args: @@ -364,7 +407,8 @@ def _align_middle_by_dic(self, mdl_word, mdl_mrp_chrs): idx += int(map_num) return maps - def _align_middle(self, mdl_word, mdl_mrp_chrs, raw_word, mrp_chrs): + def _align_middle(self, mdl_word: str, mdl_mrp_chrs: List[MrpChr], raw_word: str, + mrp_chrs: List[MrpChr]) -> List[List[MrpChr]]: """ align middle chunks after forward/backward aligning Args: @@ -417,7 +461,9 @@ def _align_middle(self, mdl_word, mdl_mrp_chrs, raw_word, mrp_chrs): return maps @classmethod - def _align_middle_preproc(cls, pfx_mrp_chrs, pfx_map, mdl_mrp_chrs, sfx_mrp_chrs, sfx_map): + def _align_middle_preproc(cls, pfx_mrp_chrs: List[MrpChr], pfx_map: List[List[MrpChr]], + mdl_mrp_chrs: List[MrpChr], sfx_mrp_chrs: List[MrpChr], + sfx_map: List[List[MrpChr]]): """ pre-processing middle part after forward/backward mapping before applying rules Args: @@ -454,7 +500,7 @@ def _align_middle_preproc(cls, pfx_mrp_chrs, pfx_map, mdl_mrp_chrs, sfx_mrp_chrs sfx_map[0].insert(0, mdl_mrp_chrs[-1]) del mdl_mrp_chrs[-1] - def _get_pfx_mdl_sfx(self, raw_word, mrp_chrs): + def _get_pfx_mdl_sfx(self, raw_word: str, mrp_chrs: List[MrpChr]) -> Tuple: """ get prefix, middle, suffix after forward/backward align Args: @@ -481,7 +527,7 @@ def _get_pfx_mdl_sfx(self, raw_word, mrp_chrs): (pfx_mrp_chrs, mdl_mrp_chrs, sfx_mrp_chrs), \ (pfx_map, sfx_map) - def _align_forward_backward(self, raw_word, mrp_chrs): + def _align_forward_backward(self, raw_word: str, mrp_chrs: List[MrpChr]) -> List[str]: """ align word with morpheme which is same phoneme Args: @@ -497,7 +543,7 @@ def _align_forward_backward(self, raw_word, mrp_chrs): if not mdl_word and not mdl_mrp_chrs: return pfx_map + sfx_map - elif not mdl_word: + if not mdl_word: if len(mdl_mrp_chrs) == 1: self._align_middle_zero2one(pfx_word, pfx_map, mdl_mrp_chrs, sfx_word, sfx_map) return pfx_map + sfx_map @@ -529,7 +575,7 @@ class AlignError(Exception): """ 음절 정렬 과정에서 나타나는 예외 """ - def __init__(self, pfx): + def __init__(self, pfx: str): """ Args: pfx: 예외 출력 시 보여줄 prefix (카테고리) @@ -541,50 +587,10 @@ def __init__(self, pfx): def __str__(self): return '\n'.join(['%s %s' % (self._pfx, _) for _ in self._msgs] + ['', ]) - def add_msg(self, msg): + def add_msg(self, msg: str): """ 메세제를 추가한다. Args: msg: 에러 메세지 """ self._msgs.append(msg) - - -class MrpChr(object): # pylint: disable=too-few-public-methods - """ - 음절과 태그 pair - """ - def __init__(self, char, tag): - """ - Args: - char: 음절 - tag: 태그 - """ - self.char = char - self.tag = tag - - def __str__(self): - return '%s/%s' % (self.char, self.tag) - - def __hash__(self): - return hash(str(self)) - - def __eq__(self, other): - """ - Args: - other: 다른 객체 - Returns: - 같을 경우 True - """ - return self.char == other.char and self.tag == other.tag - - @classmethod - def to_str(cls, mrp_chrs): - """ - MrpChr 객체 리스트를 문자열로 변환하는 메소드 - Args: - mrp_chrs: MrpChr 객체 리스트 - Returns: - 변환된 문자열 - """ - return ' '.join([str(_) for _ in mrp_chrs]) diff --git a/rsc/lib/jaso.py b/src/main/python/khaiii/resource/jaso.py similarity index 92% rename from rsc/lib/jaso.py rename to src/main/python/khaiii/resource/jaso.py index a93a64a..1f5e184 100644 --- a/rsc/lib/jaso.py +++ b/src/main/python/khaiii/resource/jaso.py @@ -4,10 +4,16 @@ """ 한글 자소 관련 유틸리티 모듈 __author__ = 'Jamie (jamie.lim@kakaocorp.com)' -__copyright__ = 'Copyright (C) 2018-, Kakao Corp. All rights reserved.' +__copyright__ = 'Copyright (C) 2019-, Kakao Corp. All rights reserved.' """ +########### +# imports # +########### +from typing import Tuple + + ############# # constants # ############# @@ -31,7 +37,7 @@ ############# # functions # ############# -def _decomp_char(char): +def _decomp_char(char: str) -> Tuple[str, str, str]: """ 한글 음절 하나를 자소로 분해한다. Args: diff --git a/rsc/lib/morphs.py b/src/main/python/khaiii/resource/morphs.py similarity index 87% rename from rsc/lib/morphs.py rename to src/main/python/khaiii/resource/morphs.py index 69184f1..be6d232 100644 --- a/rsc/lib/morphs.py +++ b/src/main/python/khaiii/resource/morphs.py @@ -2,12 +2,19 @@ """ -형태소 분석 결과를 기술한 문자열을 파싱하는 모듈 +형태소 분석 결과를 기술한 문자열을 파싱하는 모듈. +TODO(jamie): sejong_corpus 모듈의 Morph 클래스와 중복되므로 정리 필요 __author__ = 'Jamie (jamie.lim@kakaocorp.com)' -__copyright__ = 'Copyright (C) 2018-, Kakao Corp. All rights reserved.' +__copyright__ = 'Copyright (C) 2019-, Kakao Corp. All rights reserved.' """ +########### +# imports # +########### +from typing import List + + ############# # constants # ############# @@ -36,14 +43,13 @@ class ParseError(Exception): """ 형태소 분석 결과 문자열을 파싱하면서 발생하는 오류 """ - pass -class Morph(object): +class Morph: """ 형태소 """ - def __init__(self, lex, tag): + def __init__(self, lex: str, tag: str): """ Arguments: lex: 형태소(어휘) @@ -57,7 +63,7 @@ def __str__(self): return self.lex return '{}/{}'.format(self.lex, self.tag) - def is_word_delim(self): + def is_word_delim(self) -> bool: """ 어절의 경계를 나타태는 지 여부 Returns: @@ -65,7 +71,7 @@ def is_word_delim(self): """ return not self.tag and self.lex == WORD_DELIM_STR - def is_sent_delim(self): + def is_sent_delim(self) -> bool: """ 문장의 경계를 나타태는 지 여부 Returns: @@ -74,7 +80,7 @@ def is_sent_delim(self): return not self.tag and self.lex == SENT_DELIM_STR @classmethod - def to_str(cls, morphs): + def to_str(cls, morphs: List['Morph']) -> str: """ Morph 객체 리스트를 문자열로 변환한다. Arguments: @@ -85,7 +91,7 @@ def to_str(cls, morphs): return ' + '.join([str(m) for m in morphs]) @classmethod - def parse(cls, morphs_str): + def parse(cls, morphs_str: str) -> List['Morph']: """ 형태소 분석 결과 형태의 문자열을 파싱하여 Morph 객체 리스트를 반환하는 파싱 함수 Arguments: @@ -98,7 +104,7 @@ def parse(cls, morphs_str): return [cls._parse_one(m) for m in morphs_str.split(' + ')] @classmethod - def _parse_one(cls, morph_str): + def _parse_one(cls, morph_str: str) -> 'Morph': """ 하나의 형태소 객체를 기술한 문자열을 파싱한다. Arguments: diff --git a/rsc/lib/resource.py b/src/main/python/khaiii/resource/resource.py similarity index 72% rename from rsc/lib/resource.py rename to src/main/python/khaiii/resource/resource.py index a2ebc9e..3514e81 100644 --- a/rsc/lib/resource.py +++ b/src/main/python/khaiii/resource/resource.py @@ -4,18 +4,19 @@ """ resources for training and tagging __author__ = 'Jamie (jamie.lim@kakaocorp.com)' -__copyright__ = 'Copyright (C) 2018-, Kakao Corp. All rights reserved.' +__copyright__ = 'Copyright (C) 2019-, Kakao Corp. All rights reserved.' """ ########### # imports # ########### -import codecs +from argparse import Namespace import logging import os +from typing import Dict -from vocabulary import Vocabulary +from khaiii.resource.vocabulary import Vocabulary ############# @@ -33,13 +34,14 @@ ######### # types # ######### -class Resource(object): +class Resource: """ resources """ - def __init__(self, cfg): + def __init__(self, cfg: Namespace): """ - :param cfg: config + Args: + cfg: config """ vocab_in_path = '{}/vocab.in'.format(cfg.rsc_src) self.vocab_in = Vocabulary(vocab_in_path, cfg.cutoff, SPECIAL_CHARS) @@ -49,14 +51,16 @@ def __init__(self, cfg): self.restore_dic = self._load_restore_dic(restore_dic_path) @classmethod - def _load_restore_dic(cls, path): + def _load_restore_dic(cls, path: str) -> Dict[str, str]: """ load character to output tag mapping - :param path: file path - :return: dictionary + Args: + path: file path + Returns: + dictionary """ dic = {} - for line in codecs.open(path, 'r', encoding='UTF-8'): + for line in open(path, 'r', encoding='UTF-8'): line = line.rstrip('\r\n') if not line: continue diff --git a/rsc/lib/trie.py b/src/main/python/khaiii/resource/trie.py similarity index 90% rename from rsc/lib/trie.py rename to src/main/python/khaiii/resource/trie.py index b380ee9..f843285 100755 --- a/rsc/lib/trie.py +++ b/src/main/python/khaiii/resource/trie.py @@ -5,18 +5,18 @@ """ TRIE 모듈 __author__ = 'Jamie (jamie.lim@kakaocorp.com)' -__copyright__ = 'Copyright (C) 2018-, Kakao Corp. All rights reserved.' +__copyright__ = 'Copyright (C) 2019-, Kakao Corp. All rights reserved.' """ ########### # imports # ########### -import argparse -import codecs +from argparse import ArgumentParser, Namespace import logging import struct import sys +from typing import List ############# @@ -28,14 +28,73 @@ ######### # types # ######### -class Trie(object): +class Node: + """ + TRIE 노드 + """ + def __init__(self, char: str = ''): + """ + Args: + char: (유니코드) 입력 문자 + """ + self.depth = -1 # 노드를 배열로 펼치기 위해 필요한 현재 노드의 루트로부터의 깊이 + self.char = char # 입력 문자 + self.value = 0 # 단말 노드일 경우 그 값 (단말 노드가 아닌 경우 0) + self.children = {} # 자식 노드들 + self.child_start = -1 # 현재 노드로부터 자식 노드들의 시작점 사이에 있는 노드의 갯수 + + def __str__(self): + node_str = self.char if self.char else 'ROOT' + if self.value > 0: + node_str = '{%s: %d}' % (node_str, self.value) + pfx = '' + if self.depth > 0: + pfx = '·' * self.depth + child_start_str = '' + if self.child_start > 0: + child_start_str = ', {}'.format(self.child_start) + return '{}({}, [{}]{})'.format(pfx, node_str, ', '.join(self.children.keys()), + child_start_str) + + def insert(self, key: str, val: int): + """ + 문자열 키와 그 값을 현재 노드로부터 내려가며 적절한 위치에 삽입한다. + Args: + key: 키 문자열 + val: 값 (양수) + """ + if val <= 0: + raise ValueError('value must be greater than zero') + if not key: + self.value = val + elif key[0] in self.children: + self.children[key[0]].insert(key[1:], val) + else: + new_node = Node(key[0]) + self.children[key[0]] = new_node + new_node.insert(key[1:], val) + logging.debug('INSERT {%s: %d} INTO %s', key, val, self) + + def pack(self) -> bytes: + """ + 구조체로 packing한다. + Returns: + packing된 구조체 + """ + char = 0 if not self.char else self.char + if isinstance(char, str): + char = ord(char) + return _NODE_STRUCT.pack(char, self.value, self.child_start, len(self.children)) + + +class Trie: """ TRIE 인터페이스 """ def __init__(self): self.root = Node() - def insert(self, key, val): + def insert(self, key: str, val: int): """ 하나의 문자열 키와 그 값을 삽입한다. Args: @@ -44,7 +103,7 @@ def insert(self, key, val): """ self.root.insert(key, val) - def update(self, keys, vals=None): + def update(self, keys: List[str], vals: List[int] = None): """ 문자열 키와 그 값의 리스트를 차례로 삽입한다. 값이 없을 경우 키 목록의 인덱스(1부터 시작)를 값으로 설정한다. @@ -59,7 +118,7 @@ def update(self, keys, vals=None): for key, val in zip(keys, vals): self.insert(key, val) - def save(self, file_path): + def save(self, file_path: str): """ TRIE 자료구조를 파일로 저장한다. Args: @@ -74,7 +133,7 @@ def save(self, file_path): logging.info('total nodes: %d', len(nodes)) logging.info('expected size: %d', len(nodes) * _NODE_STRUCT.size) - def find(self, key): + def find(self, key: str) -> int: """ 키를 이용하여 값을 얻는다. Args: @@ -86,11 +145,10 @@ def find(self, key): for char in key: if char not in node.children: return 0 - else: - node = node.children[char] + node = node.children[char] return node.value - def _breadth_first_traverse(self): + def _breadth_first_traverse(self) -> List[Node]: """ 너비우선으로 루트로부터 전체 노드를 탐색한다. Returns: @@ -113,7 +171,7 @@ def _breadth_first_traverse(self): return nodes @classmethod - def _set_child_start(cls, nodes): + def _set_child_start(cls, nodes: List[Node]): """ child_start 필드를 세팅한다. Args: @@ -135,69 +193,10 @@ def _set_child_start(cls, nodes): else -1 -class Node(object): - """ - TRIE 노드 - """ - def __init__(self, char=''): - """ - Args: - char: (유니코드) 입력 문자 - """ - self.depth = -1 # 노드를 배열로 펼치기 위해 필요한 현재 노드의 루트로부터의 깊이 - self.char = char # 입력 문자 - self.value = 0 # 단말 노드일 경우 그 값 (단말 노드가 아닌 경우 0) - self.children = {} # 자식 노드들 - self.child_start = -1 # 현재 노드로부터 자식 노드들의 시작점 사이에 있는 노드의 갯수 - - def __str__(self): - node_str = self.char if self.char else 'ROOT' - if self.value > 0: - node_str = '{%s: %d}' % (node_str, self.value) - pfx = '' - if self.depth > 0: - pfx = '·' * self.depth - child_start_str = '' - if self.child_start > 0: - child_start_str = ', {}'.format(self.child_start) - return '{}({}, [{}]{})'.format(pfx, node_str, ', '.join(self.children.keys()), - child_start_str) - - def insert(self, key, val): - """ - 문자열 키와 그 값을 현재 노드로부터 내려가며 적절한 위치에 삽입한다. - Args: - key: 키 문자열 - val: 값 (양수) - """ - if val <= 0: - raise ValueError('value must be greater than zero') - if not key: - self.value = val - elif key[0] in self.children: - self.children[key[0]].insert(key[1:], val) - else: - new_node = Node(key[0]) - self.children[key[0]] = new_node - new_node.insert(key[1:], val) - logging.debug('INSERT {%s: %d} INTO %s', key, val, self) - - def pack(self): - """ - 구조체로 packing한다. - Returns: - packing된 구조체 - """ - char = 0 if not self.char else self.char - if isinstance(char, str): - char = ord(char) - return _NODE_STRUCT.pack(char, self.value, self.child_start, len(self.children)) - - ############# # functions # ############# -def run(args): +def run(args: Namespace): """ run function which is the start point of program Args: @@ -233,7 +232,7 @@ def main(): """ main function processes only argument parsing """ - parser = argparse.ArgumentParser(description='트라이를 빌드합니다.') + parser = ArgumentParser(description='트라이를 빌드합니다.') parser.add_argument('-o', '--output', help='output file', metavar='FILE', required=True) parser.add_argument('--input', help='input file ', metavar='FILE') parser.add_argument('--val', dest='has_val', help='탭으로 구분된 마지막 컬럼이 값일 경우', @@ -242,9 +241,7 @@ def main(): args = parser.parse_args() if args.input: - sys.stdin = codecs.open(args.input, 'r', encoding='UTF-8') - else: - sys.stdin = codecs.getreader('UTF-8')(sys.stdin.detach()) + sys.stdin = open(args.input, 'r', encoding='UTF-8') if args.debug: logging.basicConfig(level=logging.DEBUG) else: diff --git a/rsc/lib/vocabulary.py b/src/main/python/khaiii/resource/vocabulary.py similarity index 89% rename from rsc/lib/vocabulary.py rename to src/main/python/khaiii/resource/vocabulary.py index f07fb1c..6e9ba87 100644 --- a/rsc/lib/vocabulary.py +++ b/src/main/python/khaiii/resource/vocabulary.py @@ -4,17 +4,17 @@ """ vocabulary library __author__ = 'Jamie (jamie.lim@kakaocorp.com)' -__copyright__ = 'Copyright (C) 2018-, Kakao Corp. All rights reserved.' +__copyright__ = 'Copyright (C) 2019-, Kakao Corp. All rights reserved.' """ ########### # imports # ########### -import codecs import copy import logging import os +from typing import List ######### @@ -24,7 +24,7 @@ class Vocabulary: """ vocabulary class """ - def __init__(self, path, cutoff=1, special=None, padding=''): + def __init__(self, path: str, cutoff: int = 1, special: List[str] = None, padding: str = ''): """ Args: path: file path @@ -60,7 +60,7 @@ def __getitem__(self, key): except KeyError: return 0 # unknown word number - def __len__(self): + def __len__(self) -> int: return len(self.dic) ''' @@ -78,7 +78,7 @@ def get_embedding(self, dim, padding_idx=None): return nn.Embedding(len(self), dim) ''' # pylint: disable=pointless-string-statement - def padding_idx(self): + def padding_idx(self) -> int: """ 맨 마지막에 추가한 패딩의 인덱스를 리턴한다. Returns: @@ -88,7 +88,7 @@ def padding_idx(self): raise RuntimeError('vocabulary has no padding') return self.dic[self.padding] - def _load(self, path, cutoff=1): + def _load(self, path: str, cutoff: int = 1): """ load vocabulary from file Args: @@ -97,7 +97,7 @@ def _load(self, path, cutoff=1): """ append_num = 0 cutoff_num = 0 - for line in codecs.open(path, 'r', encoding='UTF-8'): + for line in open(path, 'r', encoding='UTF-8'): line = line.rstrip('\r\n') if not line: continue