Commit
Finish the training code for the model that elementwise-adds the embeddings of the left/right space virtual syllables #49
krikit committed Feb 11, 2019
1 parent eef172f commit 78930c2
Showing 8 changed files with 162 additions and 123 deletions.
13 changes: 4 additions & 9 deletions src/main/python/khaiii/resource/resource.py
@@ -23,13 +23,8 @@
#############
# constants #
#############
-SPECIAL_CHARS = [
-    '<u>',          # unknown character
-    '<w>', '</w>',  # begin/end of word
-    '<s>', '</s>'   # begin/end of sentence
-]
-
-PAD_CHR = '<p>'    # special character for padding
+UNK_CHR = '@@UNKNOWN@@'
+SPECIAL_CHARS = ['<w>', '</w>']    # begin/end of word


#########
@@ -45,9 +40,9 @@ def __init__(self, cfg: Namespace):
            cfg: config
        """
        vocab_in_path = '{}/vocab.in'.format(cfg.rsc_src)
-        self.vocab_in = Vocabulary(vocab_in_path, cfg.cutoff, SPECIAL_CHARS)
+        self.vocab_in = Vocabulary(vocab_in_path, cfg.cutoff, UNK_CHR, SPECIAL_CHARS)
        vocab_out_path = '{}/vocab.out'.format(cfg.rsc_src)
-        self.vocab_out = Vocabulary(vocab_out_path, 0, None)
+        self.vocab_out = Vocabulary(vocab_out_path)    # no unknown, no special
        restore_dic_path = '{}/restore.dic'.format(cfg.rsc_src)
        self.restore_dic = self._load_restore_dic(restore_dic_path)

36 changes: 13 additions & 23 deletions src/main/python/khaiii/resource/vocabulary.py
@@ -11,7 +11,6 @@
###########
# imports #
###########
-import copy
import logging
import os
from typing import List
@@ -24,26 +23,25 @@ class Vocabulary:
"""
vocabulary class
"""
def __init__(self, path: str, cutoff: int = 1, special: List[str] = None, padding: str = ''):
def __init__(self, path: str, cutoff: int = 1, unk: str = '', special: List[str] = None):
"""
padding index is always 0. None and '' get padding index.
if `unk` is given (such as input vocab), its index is always 1.
if `unk` is not given (such as output vocab), an exception will be thrown for unknown entry
Args:
path: file path
cutoff: cutoff frequency
unk: unknown(OOV) entry
special: special entries located at the first
padding: add padding special char at the end
"""
self.dic = {} # {entry: number} dictionary
self.rev = copy.deepcopy(special) if special else [] # reverse dictionary
self.unk = unk
self.rev = ['', unk] if unk else ['', ] # reverse dictionary
if special:
self.rev.extend(special)
for num, entry in enumerate(self.rev):
self.dic[entry] = num
self._load(path, cutoff)
self.padding = padding
if padding:
if padding in self.dic:
raise ValueError('padding special character already in vocab: {}'.format(padding))
padding_idx = len(self.dic)
self.dic[padding] = padding_idx
self.rev.append(padding)
assert len(self.dic) == len(self.rev)

@@ -57,22 +55,14 @@ def __getitem__(self, key):
            return self.rev[key]
        try:
            return self.dic[key]
-        except KeyError:
-            return 0    # unknown word number
+        except KeyError as key_err:
+            if self.unk:
+                return self.dic[self.unk]
+            raise key_err

    def __len__(self):
        return len(self.dic)

-    def padding_idx(self) -> int:
-        """
-        Return the index of the padding added at the very end.
-        Returns:
-            padding index
-        """
-        if not self.padding:
-            raise RuntimeError('vocabulary has no padding')
-        return self.dic[self.padding]
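Taken together, the new constructor and __getitem__ yield the following lookup behavior. A minimal sketch, assuming a vocab file at the illustrative path 'vocab.in' in whatever format _load expects (not shown in this diff):

# Sketch of the new lookup semantics; the paths and entries are illustrative.
vocab_in = Vocabulary('vocab.in', cutoff=1, unk=UNK_CHR, special=SPECIAL_CHARS)
assert vocab_in[''] == 0           # padding always sits at index 0
assert vocab_in[UNK_CHR] == 1      # unk, when given, always sits at index 1
assert vocab_in['<w>'] == 2 and vocab_in['</w>'] == 3
vocab_in['never-seen-entry']       # falls back to vocab_in[UNK_CHR]

vocab_out = Vocabulary('vocab.out')  # no unk, no special
vocab_out['never-seen-tag']          # raises KeyError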

    def _load(self, path: str, cutoff: int = 1):
        """
        load vocabulary from file
169 changes: 101 additions & 68 deletions src/main/python/khaiii/train/dataset.py
@@ -12,6 +12,7 @@
# imports #
###########
from argparse import Namespace
+import itertools
import logging
import os
import random
@@ -20,7 +21,7 @@
from torch import LongTensor, Tensor # pylint: disable=no-member, no-name-in-module
from tqdm import tqdm

-from khaiii.resource.resource import PAD_CHR, Resource
+from khaiii.resource.resource import Resource
from khaiii.train.sentence import PosSentence, PosWord


@@ -44,97 +45,129 @@ def __len__(self):
            return sum([len(w.raw) for w in self.pos_tagged_words]) + len(self.pos_tagged_words) + 1
        return 0

-    def make_labels(self, with_spc: bool) -> List[str]:
+    def make_contexts(self, window: int) -> List[List[str]]:
        """
-        Generate the output label (tag) for each syllable.
+        Build a context of `window` size to the left/right of each syllable.
        Args:
-            with_spc: whether to include spaces (word boundaries)
+            window: left/right window size
        Returns:
-            list of labels
+            contexts
        """
-        if not with_spc:
-            # labels of the pure syllables, excluding virtual syllables such as sentence/word boundaries
-            return [tag for pos_word in self.pos_tagged_words for tag in pos_word.tags]
-        labels = [PAD_CHR, ]    # sentence start
-        for pos_word in self.pos_tagged_words:
-            if len(labels) > 1:
-                labels.append(PAD_CHR)    # word boundary
-            labels.extend(pos_word.tags)
-        labels.append(PAD_CHR)    # sentence end
-        return labels

-    def make_contexts(self, window: int, spc_dropout: float) -> List[str]:
+        chars = [c for w in self.words for c in w]
+        chars_len = len(chars)
+        chars_padded = ['', ] * window + chars + ['', ] * window
+        contexts = [chars_padded[idx-window:idx+window+1]
+                    for idx in range(window, chars_len + window)]
+        return contexts
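A toy walkthrough of the new make_contexts, assuming window=2 and self.words == ['ab', 'cd'] (ASCII letters standing in for Korean syllables); the empty strings later map to the padding index 0:

chars        = ['a', 'b', 'c', 'd']
chars_padded = ['', '', 'a', 'b', 'c', 'd', '', '']
contexts     = [['', '', 'a', 'b', 'c'],   # context of 'a'
                ['', 'a', 'b', 'c', 'd'],  # context of 'b'
                ['a', 'b', 'c', 'd', ''],  # context of 'c'
                ['b', 'c', 'd', '', '']]   # context of 'd'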

+    @classmethod
+    def _flatten(cls, list_of_lists):
+        """
+        flatten one level of nesting
+        Args:
+            list_of_lists: list of lists
+        Returns:
+            flattened list
+        """
+        return list(itertools.chain.from_iterable(list_of_lists))
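For example, _flatten([['a', 'b'], ['c'], []]) returns ['a', 'b', 'c'].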

+    def make_left_spc_masks(self, window: int, left_vocab_id: int, spc_dropout: float) \
+            -> List[List[int]]:
        """
        Build a context of `window` size to the left/right of each syllable.
        Args:
            window: left/right window size
-            spc_dropout: space (word delimiter) dropout rate
+            left_vocab_id: vocabulary ID for '<w>'
+            spc_dropout: space dropout rate
        Returns:
-            contexts
+            left space masks
        """
-        contexts = []
-        for wrd_idx, word in enumerate(self.words):
-            for chr_idx, char in enumerate(word):
-                left_context = list(reversed(word[:chr_idx]))
-                if random.random() >= spc_dropout:
-                    left_context.append('<w>')
-                for left_word in reversed(self.words[:wrd_idx]):
-                    left_context.extend(reversed(left_word))
-                    if len(left_context) >= window:
-                        break
-                if len(left_context) < window:
-                    left_context.extend(['<s>', ] * (window - len(left_context)))
-                left_context = list(reversed(left_context[:window]))
-                assert len(left_context) == window
-
-                right_context = list(word[chr_idx+1:])
-                if random.random() >= spc_dropout:
-                    right_context.append('</w>')
-                for right_word in self.words[wrd_idx+1:]:
-                    right_context.extend(list(right_word))
-                    if len(right_context) >= window:
-                        break
-                if len(right_context) < window:
-                    right_context.extend(['</s>', ] * (window - len(right_context)))
-                right_context = right_context[:window]
-                assert len(right_context) == window
-                contexts.append(left_context + [char, ] + right_context)
-        return contexts

-    def to_tensor(self, cfg: Namespace, rsc: Resource, is_train: bool) -> Tuple[Tensor, Tensor]:
+        def _filter_left_spc_mask(left_spc_mask):
+            """
+            Keep only the first space to the left of the center syllable and remove the rest.
+            Args:
+                left_spc_mask: left space mask
+            """
+            for idx in range(window, -1, -1):
+                if left_spc_mask[idx] == left_vocab_id:
+                    if random.random() < spc_dropout:
+                        left_spc_mask[idx] = 0
+                    for jdx in range(idx-1, -1, -1):
+                        left_spc_mask[jdx] = 0
+                    break
+
+        left_spcs = self._flatten([[left_vocab_id, ] + [0, ] * (len(word)-1)
+                                   for word in self.words])
+        left_padded = [0, ] * window + left_spcs + [0, ] * window
+        left_spc_masks = [left_padded[idx-window:idx+1] + [0, ] * window
+                          for idx in range(window, len(left_spcs) + window)]
+        for left_spc_mask in left_spc_masks:
+            _filter_left_spc_mask(left_spc_mask)
+        return left_spc_masks

+    def make_right_spc_masks(self, window: int, right_vocab_id: int, spc_dropout: float) \
+            -> List[List[int]]:
+        """
+        Build a context of `window` size to the left/right of each syllable.
+        Args:
+            window: left/right window size
+            right_vocab_id: vocabulary ID for '</w>'
+            spc_dropout: space dropout rate
+        Returns:
+            right space masks
+        """
+        def _filter_right_spc_mask(right_spc_mask):
+            """
+            Keep only the first space to the right of the center syllable and remove the rest.
+            Args:
+                right_spc_mask: right space mask
+            """
+            for idx in range(window, len(right_spc_mask)):
+                if right_spc_mask[idx] == right_vocab_id:
+                    if random.random() < spc_dropout:
+                        right_spc_mask[idx] = 0
+                    for jdx in range(idx+1, len(right_spc_mask)):
+                        right_spc_mask[jdx] = 0
+                    break
+
+        right_spcs = self._flatten([[0, ] * (len(word)-1) + [right_vocab_id, ]
+                                    for word in self.words])
+        right_padded = [0, ] * window + right_spcs + [0, ] * window
+        right_spc_masks = [[0, ] * window + right_padded[idx:idx+window+1]
+                           for idx in range(window, len(right_spcs) + window)]
+        for right_spc_mask in right_spc_masks:
+            _filter_right_spc_mask(right_spc_mask)
+        return right_spc_masks
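A standalone sketch of the left-mask construction above, distilled from the diff for illustration (not the repo's actual class API), assuming window=2, a '<w>' ID of 5, and dropout disabled:

import itertools
import random

def left_spc_masks(words, window, left_id, spc_dropout=0.0):
    """Distilled re-implementation of make_left_spc_masks, for illustration only."""
    spcs = list(itertools.chain.from_iterable(
        [left_id] + [0] * (len(w) - 1) for w in words))
    padded = [0] * window + spcs + [0] * window
    masks = [padded[i - window:i + 1] + [0] * window
             for i in range(window, len(spcs) + window)]
    for mask in masks:                    # keep only the nearest left space
        for i in range(window, -1, -1):
            if mask[i] == left_id:
                if random.random() < spc_dropout:
                    mask[i] = 0
                for j in range(i - 1, -1, -1):
                    mask[j] = 0
                break
    return masks

print(left_spc_masks(['ab', 'cd'], 2, 5))
# -> [[0, 0, 5, 0, 0], [0, 5, 0, 0, 0], [0, 0, 5, 0, 0], [0, 5, 0, 0, 0]]
# The right masks mirror this: the same input with a '</w>' ID of 6 comes out as
# [[0, 0, 0, 6, 0], [0, 0, 6, 0, 0], [0, 0, 0, 6, 0], [0, 0, 6, 0, 0]].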

+    def to_tensor(self, cfg: Namespace, rsc: Resource, do_spc_dropout: bool) \
+            -> Tuple[Tensor, Tensor, Tensor, Tensor]:
        """
        Convert all the syllables and tags in the sentence into tensors that can be fed to the model's forward method.
        Args:
            cfg: config
            rsc: Resource object
-            is_train: whether in training or not
+            do_spc_dropout: whether to apply space dropout or not
        Returns:
            labels tensor
            contexts tensor
+            left space masks tensor
+            right space masks tensor
        """
        # dimension: [number of syllables in the sentence, ]
-        label_nums = [rsc.vocab_out[l] for l in self.make_labels(False)]
+        label_nums = [rsc.vocab_out[tag] for pos_word in self.pos_tagged_words \
+                      for tag in pos_word.tags]
        labels_tensor = LongTensor(label_nums)
        # dimension: [number of syllables x context size]
-        spc_dropout = cfg.spc_dropout if is_train else 0.0
-        context_nums = [[rsc.vocab_in[c] for c in context] \
-                        for context in self.make_contexts(cfg.window, spc_dropout)]
+        contexts = self.make_contexts(cfg.window)
+        context_nums = [[rsc.vocab_in[c] for c in context] for context in contexts]
        contexts_tensor = LongTensor(context_nums)
-        return labels_tensor, contexts_tensor
+        spc_dropout = cfg.spc_dropout if do_spc_dropout else 0.0
+        left_spc_masks = self.make_left_spc_masks(cfg.window, rsc.vocab_in['<w>'], spc_dropout)
+        left_spc_masks_tensor = LongTensor(left_spc_masks)
+        right_spc_masks = self.make_right_spc_masks(cfg.window, rsc.vocab_in['</w>'], spc_dropout)
+        right_spc_masks_tensor = LongTensor(right_spc_masks)

-    def make_chars(self) -> List[str]:
-        """
-        Build the syllables contained in the sentence, including sentence and word boundaries.
-        Returns:
-            list of syllables
-        """
-        chars = ['<s>', ]    # sentence start
-        for word in self.words:
-            if len(chars) > 1:
-                chars.append('<w>')    # word boundary
-            chars.extend(word)
-        chars.append('</s>')    # sentence end
-        return chars
+        return labels_tensor, contexts_tensor, left_spc_masks_tensor, right_spc_masks_tensor
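The four returned tensors line up row by row, one row per syllable. A shape sketch, assuming an illustrative sentence object sent with n syllables and cfg.window == w (so each context row has 2*w+1 entries):

labels, contexts, left_masks, right_masks = sent.to_tensor(cfg, rsc, do_spc_dropout=True)
# labels:      LongTensor of shape [n]         gold tag IDs, one per syllable
# contexts:    LongTensor of shape [n, 2*w+1]  syllable IDs in each window
# left_masks:  LongTensor of shape [n, 2*w+1]  '<w>' ID at the nearest left space, else 0
# right_masks: LongTensor of shape [n, 2*w+1]  '</w>' ID at the nearest right space, else 0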


class PosDataset:
11 changes: 8 additions & 3 deletions src/main/python/khaiii/train/embedder.py
@@ -33,17 +33,22 @@ def __init__(self, cfg: Namespace, rsc: Resource):
        super().__init__()
        self.cfg = cfg
        self.rsc = rsc
-        self.embedding = nn.Embedding(len(rsc.vocab_in), cfg.embed_dim)
+        self.embedding = nn.Embedding(len(rsc.vocab_in), cfg.embed_dim, 0)

    def forward(self, inputs):    # pylint: disable=arguments-differ
        """
        method that produces embeddings
        Args:
-            inputs: contexts of batch size
+            inputs: a batch-size list of (context, left space mask, right space mask) triples
        Returns:
            embedding
        """
-        embeds = self.embedding(inputs)
+        contexts, left_spc_masks, right_spc_masks = inputs
+        embeds = self.embedding(contexts)
+        embeds += self.embedding(left_spc_masks)
+        embeds += self.embedding(right_spc_masks)
+        # the left/right padding positions hold zero vectors, but the positional encoding below gets added onto them
+        # minor, but shouldn't the padding area also be masked before the addition below?
        embeds += positional_encoding(self.cfg.context_len, self.cfg.context_len,
                                      self.cfg.embed_dim, 1)
        return embeds
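The third argument 0 passed to nn.Embedding in __init__ above pins index 0 (the padding entry) to a frozen zero vector, which is what makes the elementwise additions a no-op everywhere except at the space positions. A minimal sketch with illustrative sizes and IDs:

import torch
from torch import nn

emb = nn.Embedding(10, 4, 0)                      # index 0 embeds to a zero vector
contexts = torch.LongTensor([[0, 0, 7, 8, 9]])    # one syllable's window (w=2)
left_mask = torch.LongTensor([[0, 0, 2, 0, 0]])   # 2 standing in for vocab_in['<w>']
right_mask = torch.LongTensor([[0, 0, 0, 3, 0]])  # 3 standing in for vocab_in['</w>']
out = emb(contexts) + emb(left_mask) + emb(right_mask)
# only the masked positions receive the '<w>'/'</w>' embedding on top of the
# syllable embedding; positions holding 0 contribute a zero vector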
21 changes: 14 additions & 7 deletions src/main/python/khaiii/train/evaluator.py
@@ -38,9 +38,12 @@ def evaluate(self) -> Tuple[float, float, float]:
"""
char_acc = self.cnt['match_chars'] / self.cnt['total_chars']
word_acc = self.cnt['match_words'] / self.cnt['total_words']
recall = self.cnt['match_morphs'] / self.cnt['total_gold_morphs']
precision = self.cnt['match_morphs'] / self.cnt['total_pred_morphs']
f_score = 2.0 * recall * precision / (recall + precision)
if self.cnt['match_morphs'] == 0:
recall = precision = f_score = 0.0
else:
recall = self.cnt['match_morphs'] / self.cnt['total_gold_morphs']
precision = self.cnt['match_morphs'] / self.cnt['total_pred_morphs']
f_score = 2.0 * recall * precision / (recall + precision)
self.cnt.clear()
return char_acc, word_acc, f_score
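The zero guard matters because an evaluation with no matched morphemes makes both recall and precision zero, and the F-score formula 2rp/(r+p) would then divide by zero. A worked example of the normal case:

# match=8, gold=10, pred=16  ->  recall = 0.8, precision = 0.5
# f_score = 2.0 * 0.8 * 0.5 / (0.8 + 0.5) = 0.8 / 1.3 ≈ 0.6154
# match=0 -> recall = precision = 0, and 2rp/(r+p) would be 0/0, hence the guard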

@@ -104,13 +107,17 @@ def morphs_to_set(cls, morphs: List[PosMorph]) -> set:
    def report(self, fout: TextIO):
        """
        report recall/precision to file
-        :param fout: output file
+        Args:
+            fout: output file
        """
        print('word accuracy: %d / %d = %.4f' % (self.cnt['match_words'], self.cnt['total_words'],
                                                 self.cnt['match_words'] / self.cnt['total_words']),
              file=fout)
-        recall = self.cnt['match_morphs'] / self.cnt['total_gold_morphs']
-        precision = self.cnt['match_morphs'] / self.cnt['total_pred_morphs']
-        f_score = 2.0 * recall * precision / (recall + precision)
+        if self.cnt['match_morphs'] == 0:
+            recall = precision = f_score = 0.0
+        else:
+            recall = self.cnt['match_morphs'] / self.cnt['total_gold_morphs']
+            precision = self.cnt['match_morphs'] / self.cnt['total_pred_morphs']
+            f_score = 2.0 * recall * precision / (recall + precision)
        print('f-score / (recall, precision): %.4f / (%.4f, %.4f)' % (f_score, recall, precision),
              file=fout)