Skip to content

Commit

Permalink
띄어쓰기 모델과 multi-task learning으로 함께 학습하는 모델의 학습 코드 완료 #58
Browse files Browse the repository at this point in the history
  • Loading branch information
krikit committed Apr 16, 2019
1 parent 78930c2 commit 2fb326c
Show file tree
Hide file tree
Showing 7 changed files with 215 additions and 161 deletions.
78 changes: 57 additions & 21 deletions src/main/python/khaiii/train/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,8 @@
import random
from typing import List, TextIO, Tuple

from torch import LongTensor, Tensor # pylint: disable=no-member, no-name-in-module
import torch
from torch import Tensor
from tqdm import tqdm

from khaiii.resource.resource import Resource
Expand All @@ -45,6 +46,20 @@ def __len__(self):
return sum([len(w.raw) for w in self.pos_tagged_words]) + len(self.pos_tagged_words) + 1
return 0

@classmethod
def to_tensor(cls, arr: List, gpu_num: int = -1) -> Tensor:
    """
    Convert a (nested) number list into a torch tensor, placed on the
    requested GPU device when CUDA is available.
    Args:
        arr: array to convert
        gpu_num: GPU device number. default: -1 for CPU
    Returns:
        tensor
    """
    # pylint: disable=no-member
    if gpu_num >= 0 and torch.cuda.is_available():
        target = torch.device('cuda', gpu_num)
    else:
        target = torch.device('cpu')
    return torch.tensor(arr, device=target)    # pylint: disable=not-callable

def make_contexts(self, window: int) -> List[List[str]]:
"""
각 음절 별로 좌/우 window 크기 만큼 context를 만든다.
Expand Down Expand Up @@ -139,35 +154,56 @@ def _filter_right_spc_mask(right_spc_mask):
_filter_right_spc_mask(right_spc_mask)
return right_spc_masks

def get_contexts(self, cfg: Namespace, rsc: 'Resource') -> List[List[int]]:
    """
    Build the context window for every syllable and map each context
    character to its input-vocabulary number.
    (Fix: removed interleaved leftover lines of the old ``to_tensor``
    method, which referenced ``LongTensor`` — no longer imported — and
    left dead stores behind.)
    Args:
        cfg: config (uses ``cfg.window`` as the context size)
        rsc: Resource object (uses ``rsc.vocab_in`` for the lookup)
    Returns:
        context number list. shape: [(syllables in sentence), (context size)]
    """
    contexts = self.make_contexts(cfg.window)
    return [[rsc.vocab_in[c] for c in context] for context in contexts]

def get_spc_masks(self, cfg: Namespace, rsc: 'Resource', do_spc_dropout: bool) \
        -> Tuple[List[List[int]], List[List[int]]]:
    """
    Build the left/right space masking vectors for every syllable.
    (Fix: removed leftover ``LongTensor(...)`` lines from the old diff —
    ``LongTensor`` is no longer imported and the assignments were dead.)
    Args:
        cfg: config (uses ``cfg.window`` and ``cfg.spc_dropout``)
        rsc: Resource object (uses ``rsc.vocab_in`` for '<w>' / '</w>')
        do_spc_dropout: whether to apply space dropout
    Returns:
        left space mask list. shape: [(syllables in sentence), (context size)]
        right space mask list. shape: [(syllables in sentence), (context size)]
    """
    # dropout rate 0.0 disables space dropout (evaluation / tagging path)
    spc_dropout = cfg.spc_dropout if do_spc_dropout else 0.0
    left_spc_masks = self.make_left_spc_masks(cfg.window, rsc.vocab_in['<w>'], spc_dropout)
    right_spc_masks = self.make_right_spc_masks(cfg.window, rsc.vocab_in['</w>'], spc_dropout)
    return left_spc_masks, right_spc_masks

def get_labels(self, rsc: 'Resource') -> List[int]:
    """
    Return the label (output tag) numbers, one per syllable.
    (Fix: removed the stray leftover ``return labels_tensor, ...`` line
    from the old diff, which referenced undefined tensor variables.)
    Args:
        rsc: Resource object (uses ``rsc.vocab_out`` for the lookup)
    Returns:
        label list. shape: [(syllables in sentence), ]
    """
    return [rsc.vocab_out[tag] for pos_word in self.pos_tagged_words for tag in pos_word.tags]
def get_spaces(self) -> List[int]:
    """
    Return a 0/1 flag for each syllable marking whether a space follows
    it: 1 on every word's last syllable, 0 elsewhere.
    Returns:
        space flag list. shape: [(syllables in sentence), ]
    """
    flags: List[int] = []
    for word in self.words:
        flags += [0] * (len(word) - 1) + [1]
    return flags


class PosDataset:
Expand Down
19 changes: 11 additions & 8 deletions src/main/python/khaiii/train/embedder.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ def __init__(self, cfg: Namespace, rsc: Resource):
self.rsc = rsc
self.embedding = nn.Embedding(len(rsc.vocab_in), cfg.embed_dim, 0)

def forward(self, inputs): # pylint: disable=arguments-differ
def forward(self, *inputs): # pylint: disable=arguments-differ
"""
임베딩을 생성하는 메소드
Args:
Expand All @@ -45,12 +45,14 @@ def forward(self, inputs): # pylint: disable=arguments-differ
"""
contexts, left_spc_masks, right_spc_masks = inputs
embeds = self.embedding(contexts)
embeds += self.embedding(left_spc_masks)
embeds += self.embedding(right_spc_masks)
if left_spc_masks is not None:
embeds += self.embedding(left_spc_masks)
if right_spc_masks is not None:
embeds += self.embedding(right_spc_masks)
# 왼쪽과 오른쪽 패딩에는 zero 벡터인데 아래 positional encoding이 더해짐
# 사소하지만 아래도 패딩 영역에 대해 마스킹 후 더해줘야 하지 않을까?
embeds += positional_encoding(self.cfg.context_len, self.cfg.context_len,
self.cfg.embed_dim, 1)
self.cfg.embed_dim, 1, self.cfg.gpu_num)
return embeds


Expand All @@ -76,7 +78,8 @@ def __missing__(self, key):


@memoize
def positional_encoding(sent_len: int, max_dim: int, embed_dim: int, method: int = 1) -> Tensor:
def positional_encoding(sent_len: int, max_dim: int, embed_dim: int, method: int = 1,
gpu_num: int = -1) -> Tensor:
"""
positional encoding Tensor 출력.
embeds [batch_size, context_len, embed_dim]에 Broadcasting 으로 더해짐
Expand All @@ -85,10 +88,12 @@ def positional_encoding(sent_len: int, max_dim: int, embed_dim: int, method: int
max_dim: maximum dimension
embed_dim: embedding dimension
method: method number (1. end-to-end memory networks or 2. attention is all you need)
gpu_num: GPU device number. default: -1 for CPU
Returns:
pe [context_len, embed_dim]
"""
pe_tensor = torch.zeros([max_dim, embed_dim]) # pylint: disable=no-member
device = gpu_num if gpu_num >= 0 else None
pe_tensor = torch.zeros([max_dim, embed_dim], device=device) # pylint: disable=no-member
for pos in range(1, sent_len + 1):
for i in range(1, embed_dim+1):
if method == 1:
Expand All @@ -101,7 +106,5 @@ def positional_encoding(sent_len: int, max_dim: int, embed_dim: int, method: int
pe_tensor[pos-1, i-1] = math.sin(pos / 10000 ** (2*i / embed_dim))
else:
pe_tensor[pos-1, i-1] = math.cos(pos / 10000 ** (2*i / embed_dim))
if torch.cuda.is_available():
pe_tensor = pe_tensor.cuda()
pe_tensor.detach()
return pe_tensor
132 changes: 72 additions & 60 deletions src/main/python/khaiii/train/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,92 +24,104 @@
#########
# types #
#########
class PosModel(nn.Module):
class ConvLayer(nn.Module):
"""
part-of-speech tagger pytorch model
형태소 태깅 모델과 띄어쓰기 모델이 공유하는 컨볼루션 레이어
"""
def __init__(self, cfg: Namespace, rsc: Resource):
"""
Args:
cfg (Namespace): config
rsc (Resource): Resource object
cfg: config
rsc: Resource object
"""
super().__init__()
self.cfg = cfg
self.rsc = rsc
self.embedder = Embedder(cfg, rsc)
ngram = min(5, cfg.window * 2 + 1)
self.convs = nn.ModuleList([nn.Conv1d(cfg.embed_dim, cfg.embed_dim, kernel_size)
for kernel_size in range(2, ngram+1)])

def forward(self, *inputs):
raise NotImplementedError
embeds = self.embedder(*inputs)
embeds_t = embeds.transpose(1, 2)
pool_outs = []
for conv in self.convs:
conv_out = F.relu(conv(embeds_t))
pool_outs.append(F.max_pool1d(conv_out, conv_out.size(2)))
features = torch.cat([p.view(embeds.size(0), -1) for p in pool_outs], dim=1) # pylint: disable=no-member
return features

def save(self, path: str):
"""
모델을 저장하는 메소드
Args:
path (str): 경로
"""
torch.save(self.state_dict(), path)

def load(self, path: str):
class HiddenLayer(nn.Module):
"""
형태소 태깅 모델과 띄어쓰기 모델이 각각 학습하는 히든 레이어
"""
def __init__(self, cfg: Namespace, rsc: Resource, conv_layer_len: int, is_spc: bool):
"""
저장된 모델을 로드하는 메소드
Args:
path (str): 경로
cfg: config
rsc: Resource object
conv_layer_len: convolution 레이어의 n-gram 타입 갯수
is_spc: 띄어쓰기 모델 여부
"""
if torch.cuda.is_available():
state_dict = torch.load(path)
else:
state_dict = torch.load(path, map_location=lambda storage, loc: storage)
self.load_state_dict(state_dict)
if torch.cuda.is_available():
self.cuda()
super().__init__()
setattr(cfg, 'hidden_dim',
(cfg.embed_dim * conv_layer_len + len(rsc.vocab_out)) // 2)
feature_dim = cfg.embed_dim * conv_layer_len
tag_dim = 2 if is_spc else len(rsc.vocab_out)
self.layers = nn.ModuleList([nn.Linear(feature_dim, cfg.hidden_dim),
nn.Linear(cfg.hidden_dim, tag_dim)])

def forward(self, features): # pylint: disable=arguments-differ
# feature => hidden
features_drop = F.dropout(features)
hidden_out = F.relu(self.layers[0](features_drop))
# hidden => tag
hidden_out_drop = F.dropout(hidden_out)
tag_out = self.layers[1](hidden_out_drop)
return tag_out


class CnnModel(PosModel):
class Model(nn.Module):
"""
convolutional neural network based part-of-speech tagger
형태소 태깅 모델, 띄어쓰기 모델
"""
def __init__(self, cfg: Namespace, rsc: Resource):
"""
Args:
cfg (Namespace): config
rsc (Resource): Resource object
cfg: config
rsc: Resource object
"""
super().__init__(cfg, rsc)

ngram = min(5, cfg.window * 2 + 1)
self.convs = nn.ModuleList([nn.Conv1d(cfg.embed_dim, cfg.embed_dim, kernel_size)
for kernel_size in range(2, ngram+1)])

# conv => hidden
setattr(cfg, 'hidden_dim', (cfg.embed_dim * len(self.convs) + len(rsc.vocab_out)) // 2)
self.conv2hidden = nn.Linear(cfg.embed_dim * len(self.convs), cfg.hidden_dim)
super().__init__()
self.cfg = cfg
self.rsc = rsc
self.conv_layer = ConvLayer(cfg, rsc)
self.hidden_layer_pos = HiddenLayer(cfg, rsc, len(self.conv_layer.convs), is_spc=False)
self.hidden_layer_spc = HiddenLayer(cfg, rsc, len(self.conv_layer.convs), is_spc=True)

# hidden => tag
self.hidden2tag = nn.Linear(cfg.hidden_dim, len(rsc.vocab_out))
def forward(self, *inputs):
contexts, left_spc_masks, right_spc_masks = inputs
features_pos = self.conv_layer(contexts, left_spc_masks, right_spc_masks)
features_spc = self.conv_layer(contexts, None, None)
logit_pos = self.hidden_layer_pos(features_pos)
logit_spc = self.hidden_layer_spc(features_spc)
return logit_pos, logit_spc

def forward(self, inputs): # pylint: disable=arguments-differ
def save(self, path: str):
"""
forward path
모델을 저장하는 메소드
Args:
inputs: batch size list of (context, left space mask, right space mask)
Returns:
output score
path: 경로
"""
embeds = self.embedder(inputs)
embeds_t = embeds.transpose(1, 2)

pool_outs = []
for conv in self.convs:
conv_out = F.relu(conv(embeds_t))
pool_outs.append(F.max_pool1d(conv_out, conv_out.size(2)))

# conv => hidden
features = torch.cat([p.view(embeds.size(0), -1) for p in pool_outs], dim=1) # pylint: disable=no-member
features_drop = F.dropout(features)
hidden_out = F.relu(self.conv2hidden(features_drop))
torch.save(self.state_dict(), path)

# hidden => tag
hidden_out_drop = F.dropout(hidden_out)
tag_out = self.hidden2tag(hidden_out_drop)
return tag_out
def load(self, path: str):
"""
저장된 모델을 로드하는 메소드
Args:
path: 경로
conv_layer: convolution layer
"""
state_dict = torch.load(path, map_location=lambda storage, loc: storage)
self.load_state_dict(state_dict)
if torch.cuda.is_available() and self.cfg.gpu_num >= 0:
self.cuda(device=self.cfg.gpu_num)
16 changes: 7 additions & 9 deletions src/main/python/khaiii/train/tagger.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,12 +16,11 @@
import logging
import re

import torch
import torch.nn.functional as F

from khaiii.resource.resource import Resource
from khaiii.train.dataset import PosSentTensor
from khaiii.train.models import CnnModel
from khaiii.train.models import Model


#########
Expand All @@ -41,7 +40,7 @@ def __init__(self, model_dir: str):
for key, val in cfg_dict.items():
setattr(self.cfg, key, val)
self.rsc = Resource(self.cfg)
self.model = CnnModel(self.cfg, self.rsc)
self.model = Model(self.cfg, self.rsc)
self.model.load('{}/model.state'.format(model_dir))
self.model.eval()

Expand All @@ -54,12 +53,11 @@ def tag_raw(self, raw_sent: str, enable_restore: bool = True) -> PosSentTensor:
PosSentTensor object
"""
pos_sent = PosSentTensor(raw_sent)
_, contexts, left_spc_masks, right_spc_masks = pos_sent.to_tensor(self.cfg, self.rsc, False)
if torch.cuda.is_available():
contexts = contexts.cuda()
left_spc_masks = left_spc_masks.cuda()
right_spc_masks = right_spc_masks.cuda()
outputs = self.model((contexts, left_spc_masks, right_spc_masks))
contexts = pos_sent.get_contexts(self.cfg, self.rsc)
left_spc_masks, right_spc_masks = pos_sent.get_spc_masks(self.cfg, self.rsc, False)
outputs, _ = self.model(PosSentTensor.to_tensor(contexts, self.cfg.gpu_num), # pylint: disable=no-member
PosSentTensor.to_tensor(left_spc_masks, self.cfg.gpu_num), # pylint: disable=no-member
PosSentTensor.to_tensor(right_spc_masks, self.cfg.gpu_num)) # pylint: disable=no-member
_, predicts = F.softmax(outputs, dim=1).max(1)
tags = [self.rsc.vocab_out[t.item()] for t in predicts]
pos_sent.set_pos_result(tags, self.rsc.restore_dic if enable_restore else None)
Expand Down
Loading

0 comments on commit 2fb326c

Please sign in to comment.