diff --git a/src/main/python/khaiii/resource/resource.py b/src/main/python/khaiii/resource/resource.py
index 110ee70..4d1099b 100644
--- a/src/main/python/khaiii/resource/resource.py
+++ b/src/main/python/khaiii/resource/resource.py
@@ -23,13 +23,8 @@
#############
# constants #
#############
-SPECIAL_CHARS = [
-    '',    # unknown character
-    '', ''    # begin/end of sentence
-]
-
-PAD_CHR = ''    # special character for padding
+UNK_CHR = '@@UNKNOWN@@'
+SPECIAL_CHARS = ['', ]
...
-        left_context.extend(['', ] * (window - len(left_context)))
-        left_context = list(reversed(left_context[:window]))
-        assert len(left_context) == window
-
-        right_context = list(word[chr_idx+1:])
-        if random.random() >= spc_dropout:
-            right_context.append('')
...
-        chars = ['', ]    # beginning of sentence
-        for word in self.words:
-            ...
-        if len(chars) > 1:
-            chars.append('')    # end of sentence
-        return chars
+        return labels_tensor, contexts_tensor, left_spc_masks_tensor, right_spc_masks_tensor
 class PosDataset:
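The two extra tensors returned by to_tensor() above, left_spc_masks and right_spc_masks, only make sense together with the padding_idx=0 embedding added in embedder.py below. A minimal sketch of how such masks can be shaped (an illustration with made-up names such as space_masks and spc_idx, not the actual dataset code):

```python
# Sketch only: index tensors shaped like `contexts`, so they can share the
# nn.Embedding(padding_idx=0) below.  Index 0 maps to the zero vector, so
# positions without a neighbouring space add nothing; positions next to a
# space add the learned embedding at the (hypothetical) reserved index spc_idx.
import torch

def space_masks(contexts: torch.Tensor, left_is_space: torch.Tensor,
                right_is_space: torch.Tensor, spc_idx: int):
    """contexts:            LongTensor [num_chars, context_len] of character indices
    left/right_is_space: BoolTensor [num_chars, context_len], True where a
                         space sits to the left/right of that position"""
    left_spc_masks = torch.where(left_is_space,
                                 torch.full_like(contexts, spc_idx),
                                 torch.zeros_like(contexts))
    right_spc_masks = torch.where(right_is_space,
                                  torch.full_like(contexts, spc_idx),
                                  torch.zeros_like(contexts))
    return left_spc_masks, right_spc_masks
```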
diff --git a/src/main/python/khaiii/train/embedder.py b/src/main/python/khaiii/train/embedder.py
index 61acf2d..b58cd7b 100644
--- a/src/main/python/khaiii/train/embedder.py
+++ b/src/main/python/khaiii/train/embedder.py
@@ -33,17 +33,22 @@ def __init__(self, cfg: Namespace, rsc: Resource):
super().__init__()
self.cfg = cfg
self.rsc = rsc
- self.embedding = nn.Embedding(len(rsc.vocab_in), cfg.embed_dim)
+ self.embedding = nn.Embedding(len(rsc.vocab_in), cfg.embed_dim, 0)
def forward(self, inputs): # pylint: disable=arguments-differ
"""
method that generates embeddings
Args:
- inputs: contexts of batch size
+ inputs: batch size list of (context, left space mask, right space mask)
Returns:
embedding
"""
- embeds = self.embedding(inputs)
+ contexts, left_spc_masks, right_spc_masks = inputs
+ embeds = self.embedding(contexts)
+ embeds += self.embedding(left_spc_masks)
+ embeds += self.embedding(right_spc_masks)
+ # the left and right padding positions are zero vectors, but the positional encoding below still gets added to them
+ # minor, but shouldn't the padding region be masked out before adding here as well?
embeds += positional_encoding(self.cfg.context_len, self.cfg.context_len,
self.cfg.embed_dim, 1)
return embeds
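On the comment above about the positional encoding being added onto zero-vector padding: one way to mask it, sketched under the assumption that index 0 is the padding index (as in nn.Embedding(len(rsc.vocab_in), cfg.embed_dim, 0)) and that the positional-encoding table can be treated as a [context_len, embed_dim] tensor; the helper name and signature are made up for illustration:

```python
# Sketch: zero the positional encoding at padded positions so padding rows
# stay exact zero vectors.  `pos_enc` is assumed to be a
# [context_len, embed_dim] table; khaiii's positional_encoding() call above
# may return a different shape.
import torch

def add_positional_encoding_masked(embeds: torch.Tensor, contexts: torch.Tensor,
                                   pos_enc: torch.Tensor, pad_idx: int = 0) -> torch.Tensor:
    """embeds:   [batch, context_len, embed_dim] summed embeddings
    contexts: [batch, context_len] character indices, pad_idx at padding
    pos_enc:  [context_len, embed_dim] positional encoding table"""
    not_pad = (contexts != pad_idx).unsqueeze(-1).to(embeds.dtype)  # [batch, context_len, 1]
    return embeds + pos_enc.unsqueeze(0) * not_pad
```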
diff --git a/src/main/python/khaiii/train/evaluator.py b/src/main/python/khaiii/train/evaluator.py
index 33f5fe3..3596039 100644
--- a/src/main/python/khaiii/train/evaluator.py
+++ b/src/main/python/khaiii/train/evaluator.py
@@ -38,9 +38,12 @@ def evaluate(self) -> Tuple[float, float, float]:
"""
char_acc = self.cnt['match_chars'] / self.cnt['total_chars']
word_acc = self.cnt['match_words'] / self.cnt['total_words']
- recall = self.cnt['match_morphs'] / self.cnt['total_gold_morphs']
- precision = self.cnt['match_morphs'] / self.cnt['total_pred_morphs']
- f_score = 2.0 * recall * precision / (recall + precision)
+ if self.cnt['match_morphs'] == 0:
+ recall = precision = f_score = 0.0
+ else:
+ recall = self.cnt['match_morphs'] / self.cnt['total_gold_morphs']
+ precision = self.cnt['match_morphs'] / self.cnt['total_pred_morphs']
+ f_score = 2.0 * recall * precision / (recall + precision)
self.cnt.clear()
return char_acc, word_acc, f_score
@@ -104,13 +107,17 @@ def morphs_to_set(cls, morphs: List[PosMorph]) -> set:
def report(self, fout: TextIO):
"""
report recall/precision to file
- :param fout: output file
+ Args:
+ fout: output file
"""
print('word accuracy: %d / %d = %.4f' % (self.cnt['match_words'], self.cnt['total_words'],
self.cnt['match_words'] / self.cnt['total_words']),
file=fout)
- recall = self.cnt['match_morphs'] / self.cnt['total_gold_morphs']
- precision = self.cnt['match_morphs'] / self.cnt['total_pred_morphs']
- f_score = 2.0 * recall * precision / (recall + precision)
+ if self.cnt['match_morphs'] == 0:
+ recall = precision = f_score = 0.0
+ else:
+ recall = self.cnt['match_morphs'] / self.cnt['total_gold_morphs']
+ precision = self.cnt['match_morphs'] / self.cnt['total_pred_morphs']
+ f_score = 2.0 * recall * precision / (recall + precision)
print('f-score / (recall, precision): %.4f / (%.4f, %.4f)' % (f_score, recall, precision),
file=fout)
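The zero-division guard is now duplicated in evaluate() and report(). A shared helper would keep the two call sites identical; a sketch (f1_prf is a hypothetical name, not existing code):

```python
# Sketch: one place for the recall/precision/f-score arithmetic, returning
# all zeros when there are no matching morphemes (the case that previously
# divided by zero).
from typing import Tuple

def f1_prf(match: int, total_gold: int, total_pred: int) -> Tuple[float, float, float]:
    """Return (f_score, recall, precision); all 0.0 when match == 0."""
    if match == 0:
        return 0.0, 0.0, 0.0
    recall = match / total_gold
    precision = match / total_pred
    return 2.0 * recall * precision / (recall + precision), recall, precision

# usage in both places:
#     f_score, recall, precision = f1_prf(self.cnt['match_morphs'],
#                                         self.cnt['total_gold_morphs'],
#                                         self.cnt['total_pred_morphs'])
```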
diff --git a/src/main/python/khaiii/train/models.py b/src/main/python/khaiii/train/models.py
index c1b3222..b4a9695 100644
--- a/src/main/python/khaiii/train/models.py
+++ b/src/main/python/khaiii/train/models.py
@@ -88,15 +88,15 @@ def __init__(self, cfg: Namespace, rsc: Resource):
# hidden => tag
self.hidden2tag = nn.Linear(cfg.hidden_dim, len(rsc.vocab_out))
- def forward(self, contexts): # pylint: disable=arguments-differ
+ def forward(self, inputs): # pylint: disable=arguments-differ
"""
forward path
Args:
- contexts: batch size list of character and context
+ inputs: batch size list of (context, left space mask, right space mask)
Returns:
output score
"""
- embeds = self.embedder(contexts)
+ embeds = self.embedder(inputs)
embeds_t = embeds.transpose(1, 2)
pool_outs = []
@@ -105,7 +105,7 @@ def forward(self, contexts): # pylint: disable=arguments-differ
pool_outs.append(F.max_pool1d(conv_out, conv_out.size(2)))
# conv => hidden
- features = torch.cat([p.view(contexts.size(0), -1) for p in pool_outs], dim=1) # pylint: disable=no-member
+ features = torch.cat([p.view(embeds.size(0), -1) for p in pool_outs], dim=1) # pylint: disable=no-member
features_drop = F.dropout(features)
hidden_out = F.relu(self.conv2hidden(features_drop))
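The switch from contexts.size(0) to embeds.size(0) is needed because inputs is now a plain tuple, which has no .size(); the batch dimension is read from the embedder output instead. A standalone shape walk-through with made-up dimensions (not the model's real configuration):

```python
# Sketch with arbitrary dimensions, following the conv/pool path above.
import torch
import torch.nn as nn
import torch.nn.functional as F

batch, context_len, embed_dim, channels = 4, 11, 8, 16
embeds = torch.randn(batch, context_len, embed_dim)   # embedder output
embeds_t = embeds.transpose(1, 2)                     # [batch, embed_dim, context_len]
conv = nn.Conv1d(embed_dim, channels, kernel_size=3)
conv_out = conv(embeds_t)                             # [batch, channels, context_len - 2]
pooled = F.max_pool1d(conv_out, conv_out.size(2))     # [batch, channels, 1]
features = pooled.view(embeds.size(0), -1)            # [batch, channels]
print(features.shape)                                 # torch.Size([4, 16])
```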
diff --git a/src/main/python/khaiii/train/tagger.py b/src/main/python/khaiii/train/tagger.py
index c0997d8..6e45a1c 100644
--- a/src/main/python/khaiii/train/tagger.py
+++ b/src/main/python/khaiii/train/tagger.py
@@ -54,10 +54,12 @@ def tag_raw(self, raw_sent: str, enable_restore: bool = True) -> PosSentTensor:
PosSentTensor object
"""
pos_sent = PosSentTensor(raw_sent)
- _, contexts = pos_sent.to_tensor(self.cfg, self.rsc, False)
+ _, contexts, left_spc_masks, right_spc_masks = pos_sent.to_tensor(self.cfg, self.rsc, False)
if torch.cuda.is_available():
contexts = contexts.cuda()
- outputs = self.model(contexts)
+ left_spc_masks = left_spc_masks.cuda()
+ right_spc_masks = right_spc_masks.cuda()
+ outputs = self.model((contexts, left_spc_masks, right_spc_masks))
_, predicts = F.softmax(outputs, dim=1).max(1)
tags = [self.rsc.vocab_out[t.item()] for t in predicts]
pos_sent.set_pos_result(tags, self.rsc.restore_dic if enable_restore else None)
diff --git a/src/main/python/khaiii/train/trainer.py b/src/main/python/khaiii/train/trainer.py
index 6bc2f37..d95d0d5 100644
--- a/src/main/python/khaiii/train/trainer.py
+++ b/src/main/python/khaiii/train/trainer.py
@@ -152,7 +152,7 @@ def _restore_prev_train(self):
if not line:
continue
(epoch, loss_train, loss_dev, acc_char, acc_word, f_score, learning_rate) = \
- line.split('\t')
+ line.split('\t')
self.cfg.epoch = int(epoch) + 1
self.cfg.best_epoch = self.cfg.epoch
self.loss_trains.append(float(loss_train))
@@ -184,7 +184,7 @@ def train(self):
self.log_file = open('{}/log.tsv'.format(self.cfg.out_dir), 'at')
self.sum_wrt = SummaryWriter(self.cfg.out_dir)
patience = self.cfg.patience
- for _ in range(1000000):
+ for _ in range(100000):
is_best = self._train_epoch()
if is_best:
patience = self.cfg.patience
@@ -226,13 +226,16 @@ def _train_epoch(self) -> bool:
loss_trains = []
for train_sent in tqdm(self.dataset_train, 'EPOCH[{}]'.format(self.cfg.epoch),
len(self.dataset_train), mininterval=1, ncols=100):
- train_labels, train_contexts = train_sent.to_tensor(self.cfg, self.rsc, True)
+ train_labels, train_contexts, left_spc_masks, right_spc_masks = \
+ train_sent.to_tensor(self.cfg, self.rsc, True)
if torch.cuda.is_available():
train_labels = train_labels.cuda()
train_contexts = train_contexts.cuda()
+ left_spc_masks = left_spc_masks.cuda()
+ right_spc_masks = right_spc_masks.cuda()
self.model.train()
- train_outputs = self.model(train_contexts)
+ train_outputs = self.model((train_contexts, left_spc_masks, right_spc_masks))
batches.append((train_labels, train_outputs))
if sum([batch[0].size(0) for batch in batches]) < self.cfg.batch_size:
continue
@@ -347,11 +350,14 @@ def evaluate(self, is_dev: bool) -> Tuple[float, float, float, float]:
losses = []
for sent in dataset:
# if spc_dropout is 1.0 or higher, spaces are not used at all, so apply that at evaluation time as well
- labels, contexts = sent.to_tensor(self.cfg, self.rsc, self.cfg.spc_dropout >= 1.0)
+ labels, contexts, left_spc_masks, right_spc_masks = \
+ sent.to_tensor(self.cfg, self.rsc, self.cfg.spc_dropout >= 1.0)
if torch.cuda.is_available():
labels = labels.cuda()
contexts = contexts.cuda()
- outputs = self.model(contexts)
+ left_spc_masks = left_spc_masks.cuda()
+ right_spc_masks = right_spc_masks.cuda()
+ outputs = self.model((contexts, left_spc_masks, right_spc_masks))
loss = self.criterion(outputs, labels)
losses.append(loss.item())
_, predicts = F.softmax(outputs, dim=1).max(1)
@@ -360,4 +366,5 @@ def evaluate(self, is_dev: bool) -> Tuple[float, float, float, float]:
pred_sent.set_pos_result(pred_tags, self.rsc.restore_dic)
self.evaler.count(sent, pred_sent)
avg_loss = sum(losses) / len(losses)
- return (avg_loss, ) + self.evaler.evaluate()
+ char_acc, word_acc, f_score = self.evaler.evaluate()
+ return avg_loss, char_acc, word_acc, f_score
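The contexts / left_spc_masks / right_spc_masks .cuda() block now appears three times: in tagger.tag_raw(), Trainer._train_epoch() and Trainer.evaluate(). A small helper could keep them in sync; a sketch (to_device is hypothetical, not part of this change):

```python
# Sketch: move the whole (contexts, left_spc_masks, right_spc_masks) tuple --
# or any tuple of tensors -- to one device in a single call.
from typing import Tuple
import torch

def to_device(tensors: Tuple[torch.Tensor, ...],
              device: torch.device) -> Tuple[torch.Tensor, ...]:
    """Return the same tuple with every tensor moved to `device`."""
    return tuple(tensor.to(device) for tensor in tensors)

# usage:
#     device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
#     inputs = to_device((contexts, left_spc_masks, right_spc_masks), device)
#     outputs = self.model(inputs)
```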