Merge branch 'release/v0.3'

kakao · Feb 12, 2019 · e29c266 · e29c266
2 parents 4b46d5e + eef172f
commit e29c266
Show file tree

Hide file tree

Showing 57 changed files with 5,126 additions and 893 deletions.
diff --git a/.github/img/network.png b/.github/img/network.png
diff --git a/.github/img/network.pptx b/.github/img/network.pptx
diff --git a/doc/img/pull-request-to-develop.png → .github/img/pull-request-to-develop.png b/doc/img/pull-request-to-develop.png → .github/img/pull-request-to-develop.png
diff --git a/doc/img/win_emb_f.png → .github/img/win_emb_f.png b/doc/img/win_emb_f.png → .github/img/win_emb_f.png
diff --git a/NOTICE.md b/NOTICE.md
@@ -97,16 +97,6 @@ Copyright 2013-2018 Niels Lohmann
 MIT License 
 
 
- **NumPy**
-
-https://github.com/numpy/numpy
-
-Copyright 2005-2018, NumPy Developers.
-
-
-BSD 3-Clause "New" or "Revised" License 
-
-
  **PyTorch**
 
 https://github.com/pytorch/pytorch
@@ -124,6 +114,16 @@ https://github.com/gabime/spdlog
 Copyright 2016 Gabi Melman
 
 
+MIT License 
+
+
+ **tensorboardX**
+
+https://github.com/lanpa/tensorboardX
+
+Copyright 2017 Tzu-Wei Huang
+
+
 MIT License 
 
 

diff --git a/README.md b/README.md
@@ -9,22 +9,22 @@ khaiii는 "Kakao Hangul Analyzer III"의 첫 글자들만 모아 만든 이름
 ----
 기존 버전이 사전과 규칙에 기반해 분석을 하는 데 반해 khaiii는 데이터(혹은 기계학습) 기반의 알고리즘을 이용하여 분석을 합니다. 학습에 사용한 코퍼스는 국립국어원에서 배포한 [21세기 세종계획 최종 성과물](https://ithub.korean.go.kr/user/noticeView.do?boardSeq=1&articleSeq=16)을 저희 카카오에서 오류를 수정하고 내용을 일부 추가하기도 한 것입니다.
 
-전처리 과정에서 오류가 발생하는 문장을 제외하고 약 85만 문장, 천만 어절의 코퍼스를 사용하여 학습을 했습니다. 코퍼스와 품사 체계에 대한 자세한 내용은 [코퍼스 문서](doc/corpus.md)를 참고하시기 바랍니다.
+전처리 과정에서 오류가 발생하는 문장을 제외하고 약 85만 문장, 천만 어절의 코퍼스를 사용하여 학습을 했습니다. 코퍼스와 품사 체계에 대한 자세한 내용은 [코퍼스](https://github.com/kakao/khaiii/wiki/%EC%BD%94%ED%8D%BC%EC%8A%A4) 문서를 참고하시기 바랍니다.
 
 
 알고리즘
 ----
 기계학습에 사용한 알고리즘은 신경망 알고리즘들 중에서 Convolutional Neural Network(CNN)을 사용하였습니다. 한국어에서 형태소분석은 자연어처리를 위한 가장 기본적인 전처리 과정이므로 속도가 매우 중요한 요소라고 생각합니다. 따라서 자연어처리에 많이 사용하는 Long-Short Term Memory(LSTM)와 같은 Recurrent Neural Network(RNN) 알고리즘은 속도 면에서 활용도가 떨어질 것으로 예상하여 고려 대상에서 제외하였습니다.
 
-CNN 모델에 대한 상세한 내용은 [CNN 모델 문서](doc/cnn_model.md)를 참고하시기 바랍니다.
+CNN 모델에 대한 상세한 내용은 [CNN 모델](https://github.com/kakao/khaiii/wiki/CNN-%EB%AA%A8%EB%8D%B8) 문서를 참고하시기 바랍니다.
 
 
 성능
 ----
 ### 정확도
 CNN 모델의 주요 하이퍼 파라미터는 분류하려는 음절의 좌/우 문맥의 크기를 나타내는 win 값과, 음절 임베딩의 차원을 나타내는 emb 값입니다. win 값은 {2, 3, 4, 5, 7, 10}의 값을 가지며, emb 값은 {20, 30, 40, 50, 70, 100, 150, 200, 300, 500}의 값을 가집니다. 따라서 이 두 가지 값의 조합은 6 x 10으로 총 60가지를 실험하였고 아래와 같은 성능을 보였습니다. 성능 지표는 정확률과 재현율의 조화 평균값인 F-Score입니다.
 
-![](doc/img/win_emb_f.png)
+![](.github/img/win_emb_f.png)
 
 win 파라미터의 경우 3 혹은 4에서 가장 좋은 성능을 보이며 그 이상에서는 오히려 성능이 떨어집니다. emb 파라미터의 경우 150까지는 성능도 같이 높아지다가 그 이상에서는 별 차이가 없습니다. 최 상위 5위 중 비교적 작은 모델은 win=3, emb=150으로 F-Score 값은 97.11입니다. 이 모델을 large 모델이라 명명합니다.
 

diff --git a/doc/cnn_model.md b/doc/cnn_model.md
diff --git a/doc/corpus.md b/doc/corpus.md
diff --git a/doc/img/network.png b/doc/img/network.png
diff --git a/doc/img/network.pptx b/doc/img/network.pptx
diff --git a/include/khaiii/khaiii_api.h b/include/khaiii/khaiii_api.h
@@ -12,7 +12,7 @@
 // constants //
 ///////////////
 #define KHAIII_VERSION_MAJOR 0
-#define KHAIII_VERSION_MINOR 2
+#define KHAIII_VERSION_MINOR 3
 #define _MAC2STR(m) #m
 #define _JOIN_VER(x,y) _MAC2STR(x) "." _MAC2STR(y)    // NOLINT
 #define KHAIII_VERSION _JOIN_VER(KHAIII_VERSION_MAJOR,KHAIII_VERSION_MINOR)    // NOLINT

diff --git a/munjong/apply_patch.py b/munjong/apply_patch.py
@@ -0,0 +1,79 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+
+"""
+apply patch to original Sejong corpus
+__author__ = 'Jamie (jamie.lim@kakaocorp.com)'
+__copyright__ = 'Copyright (C) 2019-, Kakao Corp. All rights reserved.'
+"""
+
+
+###########
+# imports #
+###########
+from argparse import ArgumentParser, Namespace
+import logging
+import os
+import shutil
+
+from khaiii.munjong import libpatch
+
+
+#############
+# functions #
+#############
+def run(args: Namespace):
+    """
+    run function which is the start point of program
+    Args:
+        args:  program arguments
+    """
+    if not os.path.exists(args.modified):
+        logging.info('creating modified corpus dir: %s', args.modified)
+        os.mkdir(args.modified)
+
+    for name in sorted(os.listdir(args.original)):
+        if not name.endswith('.txt'):
+            continue
+        org_path = '%s/%s' % (args.original, name)
+        mod_path = '%s/%s' % (args.modified, name)
+        patch_path = '%s/%s.patch' % (args.patch, name[:-len('.txt')])
+        if os.path.exists(patch_path):
+            logging.info('[%s] + [%s] = [%s]', org_path, patch_path, mod_path)
+            libpatch.apply(org_path, args.org_enc, patch_path, mod_path, args.mod_enc)
+        else:
+            logging.info('[%s] = [%s]', org_path, mod_path)
+            shutil.copyfile(org_path, mod_path)
+
+
+########
+# main #
+########
+def main():
+    """
+    main function processes only argument parsing
+    """
+    parser = ArgumentParser(description='apply patch to original Sejong corpus')
+    parser.add_argument('-o', '--original', help='original corpus dir', metavar='DIR',
+                        required=True)
+    parser.add_argument('-p', '--patch', help='patch dir', metavar='DIR', required=True)
+    parser.add_argument('-m', '--modified', help='modified corpus output dir', metavar='DIR',
+                        required=True)
+    parser.add_argument('--org-enc', help='original corpus encoding <default: UTF-16>',
+                        metavar='ENCODING', default='UTF-16')
+    parser.add_argument('--mod-enc', help='modified corpus encoding <default: UTF-8>',
+                        metavar='ENCODING', default='UTF-8')
+    parser.add_argument('--debug', help='enable debug', action='store_true')
+    args = parser.parse_args()
+
+    if args.debug:
+        logging.basicConfig(level=logging.DEBUG)
+    else:
+        logging.basicConfig(level=logging.INFO)
+
+    run(args)
+
+
+if __name__ == '__main__':
+    main()