Skip to content

Commit

Permalink
학습 스크립트를 추가하기 위해 rsc/lib 아래의 python 모듈들을 src/main/python/khaiii 아래로 정리. type hint 관련 코드 리팩토링 #30
Browse files Browse the repository at this point in the history
  • Loading branch information
krikit committed Feb 10, 2019
1 parent 896a091 commit 918955f
Show file tree
Hide file tree
Showing 16 changed files with 404 additions and 372 deletions.
10 changes: 5 additions & 5 deletions rsc/Makefile
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
HOME_DIR = .
BIN_DIR = $(HOME_DIR)/bin
LIB_DIR = $(HOME_DIR)/lib
SRC_PYTHON = $(HOME_DIR)/../src/main/python
RSC_SRC = $(HOME_DIR)/src
PREFIX = /usr/local
RSC_DIR = $(PREFIX)/share/khaiii
Expand Down Expand Up @@ -35,22 +35,22 @@ all: $(MODEL) $(PREANAL) $(RESTORE) $(ERRPATCH)
$(wordlist 2,100,$(MODEL)): $(firstword $(MODEL))
$(firstword $(MODEL)): $(RSC_SRC)/$(MODEL_SIZE).config.json $(RSC_SRC)/$(MODEL_SIZE).model.pickle
mkdir -p $(RSC_DIR)
PYTHONPATH=$(LIB_DIR) python3 $(BIN_DIR)/compile_model.py --model-size $(MODEL_SIZE) --rsc-src $(RSC_SRC) --rsc-dir $(RSC_DIR)
PYTHONPATH=$(SRC_PYTHON) python3 $(BIN_DIR)/compile_model.py --model-size $(MODEL_SIZE) --rsc-src $(RSC_SRC) --rsc-dir $(RSC_DIR)

$(wordlist 2,100,$(PREANAL)): $(firstword $(PREANAL))
$(firstword $(PREANAL)): $(RSC_SRC)/preanal.auto $(RSC_SRC)/preanal.manual
mkdir -p $(RSC_DIR)
PYTHONPATH=$(LIB_DIR) python3 $(BIN_DIR)/compile_preanal.py --rsc-src $(RSC_SRC) --rsc-dir $(RSC_DIR)
PYTHONPATH=$(SRC_PYTHON) python3 $(BIN_DIR)/compile_preanal.py --rsc-src $(RSC_SRC) --rsc-dir $(RSC_DIR)

$(wordlist 2,100,$(RESTORE)): $(firstword $(RESTORE))
$(firstword $(RESTORE)): $(RSC_SRC)/restore.dic $(RSC_SRC)/vocab.out $(RSC_SRC)/vocab.out.more
mkdir -p $(RSC_DIR)
PYTHONPATH=$(LIB_DIR) python3 $(BIN_DIR)/compile_restore.py --rsc-src $(RSC_SRC) --rsc-dir $(RSC_DIR)
PYTHONPATH=$(SRC_PYTHON) python3 $(BIN_DIR)/compile_restore.py --rsc-src $(RSC_SRC) --rsc-dir $(RSC_DIR)

$(wordlist 2,100,$(ERRPATCH)): $(firstword $(ERRPATCH))
$(firstword $(ERRPATCH)): $(RSC_SRC)/$(MODEL_SIZE).errpatch.auto $(RSC_SRC)/$(MODEL_SIZE).errpatch.manual
mkdir -p $(RSC_DIR)
PYTHONPATH=$(LIB_DIR) python3 $(BIN_DIR)/compile_errpatch.py --model-size $(MODEL_SIZE) --rsc-src $(RSC_SRC) --rsc-dir $(RSC_DIR)
PYTHONPATH=$(SRC_PYTHON) python3 $(BIN_DIR)/compile_errpatch.py --model-size $(MODEL_SIZE) --rsc-src $(RSC_SRC) --rsc-dir $(RSC_DIR)

clean:
rm -rf $(RSC_DIR)
41 changes: 22 additions & 19 deletions rsc/bin/compile_errpatch.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,29 +5,31 @@
"""
오분석 패치를 빌드하는 스크립트
__author__ = 'Jamie (jamie.lim@kakaocorp.com)'
__copyright__ = 'Copyright (C) 2018-, Kakao Corp. All rights reserved.'
__copyright__ = 'Copyright (C) 2019-, Kakao Corp. All rights reserved.'
"""


###########
# imports #
###########
import argparse
from argparse import ArgumentParser, Namespace
from collections import defaultdict
import glob
import itertools
import logging
import os
import struct
import sys
from typing import Dict, List, Tuple

from khaiii.munjong import sejong_corpus
from khaiii.resource.char_align import Aligner, AlignError
from khaiii.resource.morphs import Morph, ParseError
from khaiii.resource.morphs import WORD_DELIM_STR, SENT_DELIM_STR, WORD_DELIM_NUM, SENT_DELIM_NUM
from khaiii.resource.trie import Trie

from char_align import Aligner, AlignError
from compile_preanal import align_to_tag, print_errors
from compile_restore import load_restore_dic, load_vocab_out
from morphs import Morph, ParseError
from morphs import WORD_DELIM_STR, SENT_DELIM_STR, WORD_DELIM_NUM, SENT_DELIM_NUM
import sejong_corpus
from trie import Trie


#########
Expand All @@ -37,7 +39,7 @@ class Entry:
"""
error patch entry
"""
def __init__(self, file_path, line_num, line):
def __init__(self, file_path: str, line_num: int, line: str):
"""
Args:
file_path: 파일 경로
Expand Down Expand Up @@ -65,7 +67,7 @@ def __str__(self):
return '{}: "{}"'.format(file_num, line)
return '{}\t{}\t{}'.format(self.raw, Morph.to_str(self.left), Morph.to_str(self.right))

def key_str(self):
def key_str(self) -> str:
"""
패치의 중복 검사를 하기 위해 원문과 left를 이용하여 키를 생성
Returns:
Expand Down Expand Up @@ -102,7 +104,7 @@ def _parse(self):
#############
# functions #
#############
def _split_list(lst, delim):
def _split_list(lst: List[str], delim: str) -> List[List[str]]:
"""
리스트를 delimiter로 split하는 함수
Expand All @@ -125,7 +127,8 @@ def _split_list(lst, delim):
return sublists


def align_patch(rsc_src, raw, morph_str):
def align_patch(rsc_src: Tuple[Aligner, Dict, Dict[str, int]], raw: str, morph_str: str) \
-> List[int]:
"""
패치의 원문과 분석 결과를 음절단위 매핑(정렬)을 수행한다.
Args:
Expand Down Expand Up @@ -171,7 +174,7 @@ def align_patch(rsc_src, raw, morph_str):
return tag_nums


def mix_char_tag(chars, tags):
def mix_char_tag(chars: str, tags: List[int]) -> List[int]:
"""
음절과 출력 태그를 비트 연산으로 합쳐서 하나의 (32비트) 숫자로 표현한다.
Args:
Expand All @@ -195,11 +198,11 @@ def mix_char_tag(chars, tags):
return char_nums


def _load_entries(args):
def _load_entries(args: Namespace) -> List[Entry]:
"""
패치 엔트리를 파일로부터 로드한다.
Args:
args: arguments
args: program arguments
Returns:
엔트리 리스트
"""
Expand All @@ -221,7 +224,7 @@ def _load_entries(args):
return good_entries


def _check_dup(entries):
def _check_dup(entries: List[Entry]):
"""
중복된 엔트리가 없는 지 확인한다.
Args:
Expand All @@ -239,7 +242,7 @@ def _check_dup(entries):
print_errors(bad_entries)


def _set_align(rsc_src, entries): # pylint: disable=invalid-name
def _set_align(rsc_src: Tuple[Aligner, dict, Dict[str, int]], entries: List[Entry]):
"""
음절과 형태소 분석 결과를 정렬한다.
Args:
Expand All @@ -265,7 +268,7 @@ def _set_align(rsc_src, entries): # pylint: disable=invalid-name
print_errors(bad_entries)


def _save_trie(rsc_dir, entries):
def _save_trie(rsc_dir: str, entries: List[Entry]):
"""
트라이를 저장한다.
Args:
Expand Down Expand Up @@ -309,7 +312,7 @@ def _save_trie(rsc_dir, entries):
(sum([len(r) for r in rights])+1) * struct.Struct('h').size)


def run(args):
def run(args: Namespace):
"""
run function which is the start point of program
Args:
Expand Down Expand Up @@ -338,7 +341,7 @@ def main():
"""
main function processes only argument parsing
"""
parser = argparse.ArgumentParser(description='기분석 사전을 빌드하는 스크립트')
parser = ArgumentParser(description='기분석 사전을 빌드하는 스크립트')
parser.add_argument('--model-size', help='model size <default: base>',
metavar='SIZE', default='base')
parser.add_argument('--rsc-src', help='source directory (text) <default: ./src>',
Expand Down
11 changes: 5 additions & 6 deletions rsc/bin/compile_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,23 +5,22 @@
"""
compile trained model for C/C++ decoder
__author__ = 'Jamie (jamie.lim@kakaocorp.com)'
__copyright__ = 'Copyright (C) 2018-, Kakao Corp. All rights reserved.'
__copyright__ = 'Copyright (C) 2019-, Kakao Corp. All rights reserved.'
"""


###########
# imports #
###########
import argparse
from argparse import Namespace
from argparse import ArgumentParser, Namespace
import json
import logging
import os
import pathlib
import pickle
from typing import Tuple

from resource import Resource # pylint: disable=wrong-import-order
from khaiii.resource.resource import Resource


#############
Expand All @@ -40,7 +39,7 @@ def load_cfg_rsc(rsc_src: str, model_size: str) -> Tuple[Namespace, Resource]:
file_path = '{}/{}.config.json'.format(rsc_src, model_size)
cfg_dic = json.load(open(file_path, 'r', encoding='UTF-8'))
logging.info('config: %s', json.dumps(cfg_dic, indent=2))
cfg = argparse.Namespace()
cfg = Namespace()
for key, val in cfg_dic.items():
setattr(cfg, key, val)
cwd = os.path.realpath(os.getcwd())
Expand Down Expand Up @@ -159,7 +158,7 @@ def main():
"""
main function processes only argument parsing
"""
parser = argparse.ArgumentParser(description='part-of-speech tagger')
parser = ArgumentParser(description='part-of-speech tagger')
parser.add_argument('--model-size', help='model size <default: base>',
metavar='SIZE', default='base')
parser.add_argument('--rsc-src', help='source directory (model) <default: ./src>',
Expand Down
41 changes: 23 additions & 18 deletions rsc/bin/compile_preanal.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,36 +5,38 @@
"""
기분석 사전을 빌드하는 스크립트
__author__ = 'Jamie (jamie.lim@kakaocorp.com)'
__copyright__ = 'Copyright (C) 2018-, Kakao Corp. All rights reserved.'
__copyright__ = 'Copyright (C) 2019-, Kakao Corp. All rights reserved.'
"""


###########
# imports #
###########
import argparse
from argparse import ArgumentParser, Namespace
from collections import defaultdict
import glob
import logging
import os
import struct
import sys
from typing import Dict, List, Tuple

from khaiii.munjong import sejong_corpus
from khaiii.resource.char_align import Aligner, AlignError, MrpChr
from khaiii.resource.morphs import Morph, ParseError
from khaiii.resource.trie import Trie

from compile_restore import load_restore_dic, load_vocab_out, append_new_entries
from char_align import Aligner, AlignError, MrpChr
from morphs import Morph, ParseError
import sejong_corpus
from trie import Trie


#########
# types #
#########
class Entry(object):
class Entry:
"""
pre-analyzed dictionary entry
"""
def __init__(self, file_path, line_num, line):
def __init__(self, file_path: str, line_num: int, line: str):
"""
Args:
file_path: 파일 경로
Expand All @@ -59,7 +61,7 @@ def __str__(self):
line = '# {}'.format(self.line) if self.is_sharp else self.line
if self.err_msg:
return '{}{}: "{}"'.format(file_num, self.err_msg, line)
elif self.is_sharp:
if self.is_sharp:
return '{}: "{}"'.format(file_num, line)
return '{}{}\t{}'.format(self.word, '*' if self.is_pfx else '', Morph.to_str(self.morphs))

Expand Down Expand Up @@ -94,7 +96,7 @@ def _parse(self):
#############
# functions #
#############
def print_errors(entries):
def print_errors(entries: List[Entry]):
"""
에러가 발생한 엔트리를 출력하고 프로그램을 종료한다.
Args:
Expand All @@ -107,7 +109,7 @@ def print_errors(entries):
sys.exit(1)


def _load_entries(args):
def _load_entries(args: Namespace) -> List[Entry]:
"""
사전 엔트리를 파일로부터 로드한다.
Args:
Expand All @@ -133,7 +135,7 @@ def _load_entries(args):
return good_entries


def _check_dup(entries):
def _check_dup(entries: List[Entry]):
"""
중복된 엔트리가 없는 지 확인한다.
Args:
Expand All @@ -151,7 +153,7 @@ def _check_dup(entries):
print_errors(bad_entries)


def _set_align(aligner, Word, entries): # pylint: disable=invalid-name
def _set_align(aligner: Aligner, Word: type, entries: List[Entry]): # pylint: disable=invalid-name
"""
음절과 형태소 분석 결과를 정렬한다.
Args:
Expand All @@ -173,7 +175,9 @@ def _set_align(aligner, Word, entries): # pylint: disable=invalid-name
print_errors(bad_entries)


def align_to_tag(raw_word, alignment, restore, vocab):
def align_to_tag(raw_word: str, alignment: List[List[str]], restore: Tuple[dict, dict],
vocab: Tuple[Dict[str, int], Dict[str, int]]) \
-> Tuple[List[str], List[int]]:
"""
어절의 원문과 정렬 정보를 활용해 음절과 매핑된 태그를 생성한다.
Args:
Expand Down Expand Up @@ -224,7 +228,8 @@ def align_to_tag(raw_word, alignment, restore, vocab):
return tag_outs, tag_nums


def _set_tag_out(restore_dic, restore_new, vocab_out, vocab_new, entries):
def _set_tag_out(restore_dic: dict, restore_new: dict, vocab_out: Dict[str, int],
vocab_new: Dict[str, int], entries: List[Entry]):
"""
음절 정렬로부터 출력 태그를 결정하고 출력 태그의 번호를 매핑한다.
Args:
Expand All @@ -240,7 +245,7 @@ def _set_tag_out(restore_dic, restore_new, vocab_out, vocab_new, entries):
(vocab_out, vocab_new))


def _save_trie(rsc_dir, entries):
def _save_trie(rsc_dir: str, entries: List[Entry]):
"""
트라이를 저장한다.
Args:
Expand Down Expand Up @@ -270,7 +275,7 @@ def _save_trie(rsc_dir, entries):
(sum([len(e.tag_nums) for e in entries])+1) * struct.Struct('H').size)


def run(args):
def run(args: Namespace):
"""
run function which is the start point of program
Args:
Expand Down Expand Up @@ -301,7 +306,7 @@ def main():
"""
main function processes only argument parsing
"""
parser = argparse.ArgumentParser(description='기분석 사전을 빌드하는 스크립트')
parser = ArgumentParser(description='기분석 사전을 빌드하는 스크립트')
parser.add_argument('--rsc-src', help='source directory (text) <default: ./src>',
metavar='DIR', default='./src')
parser.add_argument('--rsc-dir', help='target directory (binary) <default: ./share/khaiii>',
Expand Down
Loading

0 comments on commit 918955f

Please sign in to comment.