diff --git a/src/main/python/khaiii/resource/resource.py b/src/main/python/khaiii/resource/resource.py
index 110ee70..4d1099b 100644
--- a/src/main/python/khaiii/resource/resource.py
+++ b/src/main/python/khaiii/resource/resource.py
@@ -23,13 +23,8 @@
 #############
 # constants #
 #############
-SPECIAL_CHARS = [
-    '<u>',    # unknown character
-    '<w>', '</w>',    # begin/end of word
-    '<s>', '</s>'    # begin/end of sentence
-]
-
-PAD_CHR = '<p>'    # special character for padding
+UNK_CHR = '@@UNKNOWN@@'
+SPECIAL_CHARS = ['<w>', '</w>']    # begin/end of word
 
 
 #########
@@ -45,9 +40,9 @@ def __init__(self, cfg: Namespace):
             cfg: config
         """
         vocab_in_path = '{}/vocab.in'.format(cfg.rsc_src)
-        self.vocab_in = Vocabulary(vocab_in_path, cfg.cutoff, SPECIAL_CHARS)
+        self.vocab_in = Vocabulary(vocab_in_path, cfg.cutoff, UNK_CHR, SPECIAL_CHARS)
         vocab_out_path = '{}/vocab.out'.format(cfg.rsc_src)
-        self.vocab_out = Vocabulary(vocab_out_path, 0, None)
+        self.vocab_out = Vocabulary(vocab_out_path)    # no unknown, no special
         restore_dic_path = '{}/restore.dic'.format(cfg.rsc_src)
         self.restore_dic = self._load_restore_dic(restore_dic_path)
 
diff --git a/src/main/python/khaiii/resource/vocabulary.py b/src/main/python/khaiii/resource/vocabulary.py
index eb7a67c..aa492b0 100644
--- a/src/main/python/khaiii/resource/vocabulary.py
+++ b/src/main/python/khaiii/resource/vocabulary.py
@@ -11,7 +11,6 @@
 ###########
 # imports #
 ###########
-import copy
 import logging
 import os
 from typing import List
@@ -24,26 +23,25 @@ class Vocabulary:
     """
     vocabulary class
     """
-    def __init__(self, path: str, cutoff: int = 1, special: List[str] = None, padding: str = ''):
+    def __init__(self, path: str, cutoff: int = 1, unk: str = '', special: List[str] = None):
         """
+        padding index is always 0. None and '' get the padding index.
+        if `unk` is given (such as the input vocab), its index is always 1.
+        if `unk` is not given (such as the output vocab), an exception will be thrown for unknown entries
         Args:
             path: file path
             cutoff: cutoff frequency
+            unk: unknown (OOV) entry
             special: special entries located at the front
-            padding: add padding special char at the end
         """
         self.dic = {}    # {entry: number} dictionary
-        self.rev = copy.deepcopy(special) if special else []    # reverse dictionary
+        self.unk = unk
+        self.rev = ['', unk] if unk else ['', ]    # reverse dictionary
+        if special:
+            self.rev.extend(special)
         for num, entry in enumerate(self.rev):
             self.dic[entry] = num
         self._load(path, cutoff)
-        self.padding = padding
-        if padding:
-            if padding in self.dic:
-                raise ValueError('padding special character already in vocab: {}'.format(padding))
-            padding_idx = len(self.dic)
-            self.dic[padding] = padding_idx
-            self.rev.append(padding)
         assert len(self.dic) == len(self.rev)
 
     def __getitem__(self, key):
@@ -57,22 +55,14 @@ def __getitem__(self, key):
             return self.rev[key]
         try:
             return self.dic[key]
-        except KeyError:
-            return 0    # unknown word number
+        except KeyError as key_err:
+            if self.unk:
+                return self.dic[self.unk]
+            raise key_err
 
     def __len__(self):
         return len(self.dic)
 
-    def padding_idx(self) -> int:
-        """
-        return the index of the padding entry appended at the very end.
-        Returns:
-            padding index
-        """
-        if not self.padding:
-            raise RuntimeError('vocabulary has no padding')
-        return self.dic[self.padding]
-
     def _load(self, path: str, cutoff: int = 1):
         """
         load vocabulary from file
diff --git a/src/main/python/khaiii/train/dataset.py b/src/main/python/khaiii/train/dataset.py
index 7aee785..a1b1612 100644
--- a/src/main/python/khaiii/train/dataset.py
+++ b/src/main/python/khaiii/train/dataset.py
@@ -12,6 +12,7 @@
 # imports #
 ###########
 from argparse import Namespace
+import itertools
 import logging
 import os
 import random
@@ -20,7 +21,7 @@
 from torch import LongTensor, Tensor    # pylint: disable=no-member, no-name-in-module
 from tqdm import tqdm
 
-from khaiii.resource.resource import PAD_CHR, Resource
+from khaiii.resource.resource import Resource
 from khaiii.train.sentence import PosSentence, PosWord
 
 
@@ -44,97 +45,129 @@ def __len__(self):
             return sum([len(w.raw) for w in self.pos_tagged_words]) + len(self.pos_tagged_words) + 1
         return 0
 
-    def make_labels(self, with_spc: bool) -> List[str]:
+    def make_contexts(self, window: int) -> List[List[str]]:
         """
-        generate the output label (tag) for each character.
+        make contexts of left/right `window` size for each character.
         Args:
-            with_spc: whether to include spaces (word boundaries)
+            window: left/right window size
         Returns:
-            list of labels
+            contexts
         """
-        if not with_spc:
-            # labels for plain characters only, excluding virtual characters such as sentence/word boundaries
-            return [tag for pos_word in self.pos_tagged_words for tag in pos_word.tags]
-        labels = [PAD_CHR, ]    # beginning of sentence
-        for pos_word in self.pos_tagged_words:
-            if len(labels) > 1:
-                labels.append(PAD_CHR)    # word boundary
-            labels.extend(pos_word.tags)
-        labels.append(PAD_CHR)    # end of sentence
-        return labels
-
-    def make_contexts(self, window: int, spc_dropout: float) -> List[str]:
+        chars = [c for w in self.words for c in w]
+        chars_len = len(chars)
+        chars_padded = ['', ] * window + chars + ['', ] * window
+        contexts = [chars_padded[idx-window:idx+window+1]
+                    for idx in range(window, chars_len + window)]
+        return contexts
+
+    @classmethod
+    def _flatten(cls, list_of_lists):
+        """
+        flatten one level of nesting
+        Args:
+            list_of_lists: list of lists
+        Returns:
+            flattened list
+        """
+        return list(itertools.chain.from_iterable(list_of_lists))
+
+    def make_left_spc_masks(self, window: int, left_vocab_id: int, spc_dropout: float) \
+            -> List[List[int]]:
         """
         make contexts of left/right `window` size for each character.
         Args:
             window: left/right window size
-            spc_dropout: space (word delimiter) dropout rate
+            left_vocab_id: vocabulary ID for '<w>'
+            spc_dropout: space dropout rate
         Returns:
-            contexts
+            left space masks
         """
-        contexts = []
-        for wrd_idx, word in enumerate(self.words):
-            for chr_idx, char in enumerate(word):
-                left_context = list(reversed(word[:chr_idx]))
-                if random.random() >= spc_dropout:
-                    left_context.append('<w>')
-                for left_word in reversed(self.words[:wrd_idx]):
-                    left_context.extend(reversed(left_word))
-                    if len(left_context) >= window:
-                        break
-                if len(left_context) < window:
-                    left_context.extend(['<p>', ] * (window - len(left_context)))
-                left_context = list(reversed(left_context[:window]))
-                assert len(left_context) == window
-
-                right_context = list(word[chr_idx+1:])
-                if random.random() >= spc_dropout:
-                    right_context.append('</w>')
-                for right_word in self.words[wrd_idx+1:]:
-                    right_context.extend(list(right_word))
-                    if len(right_context) >= window:
-                        break
-                if len(right_context) < window:
-                    right_context.extend(['<p>', ] * (window - len(right_context)))
-                right_context = right_context[:window]
-                assert len(right_context) == window
-                contexts.append(left_context + [char, ] + right_context)
-        return contexts
-
-    def to_tensor(self, cfg: Namespace, rsc: Resource, is_train: bool) -> Tuple[Tensor, Tensor]:
+        def _filter_left_spc_mask(left_spc_mask):
+            """
+            keep only the first space to the left of the center character and clear the rest.
+            Args:
+                left_spc_mask: left space mask
+            """
+            for idx in range(window, -1, -1):
+                if left_spc_mask[idx] == left_vocab_id:
+                    if random.random() < spc_dropout:
+                        left_spc_mask[idx] = 0
+                    for jdx in range(idx-1, -1, -1):
+                        left_spc_mask[jdx] = 0
+                    break
+
+        left_spcs = self._flatten([[left_vocab_id, ] + [0, ] * (len(word)-1)
+                                   for word in self.words])
+        left_padded = [0, ] * window + left_spcs + [0, ] * window
+        left_spc_masks = [left_padded[idx-window:idx+1] + [0, ] * window
+                          for idx in range(window, len(left_spcs) + window)]
+        for left_spc_mask in left_spc_masks:
+            _filter_left_spc_mask(left_spc_mask)
+        return left_spc_masks
+
+    def make_right_spc_masks(self, window: int, right_vocab_id: int, spc_dropout: float) \
+            -> List[List[int]]:
+        """
+        make right space masks for each character.
+        Args:
+            window: left/right window size
+            right_vocab_id: vocabulary ID for '</w>'
+            spc_dropout: space dropout rate
+        Returns:
+            right space masks
+        """
+        def _filter_right_spc_mask(right_spc_mask):
+            """
+            keep only the first space to the right of the center character and clear the rest.
+            Args:
+                right_spc_mask: right space mask
+            """
+            for idx in range(window, len(right_spc_mask)):
+                if right_spc_mask[idx] == right_vocab_id:
+                    if random.random() < spc_dropout:
+                        right_spc_mask[idx] = 0
+                    for jdx in range(idx+1, len(right_spc_mask)):
+                        right_spc_mask[jdx] = 0
+                    break
+
+        right_spcs = self._flatten([[0, ] * (len(word)-1) + [right_vocab_id, ]
+                                    for word in self.words])
+        right_padded = [0, ] * window + right_spcs + [0, ] * window
+        right_spc_masks = [[0, ] * window + right_padded[idx:idx+window+1]
+                           for idx in range(window, len(right_spcs) + window)]
+        for right_spc_mask in right_spc_masks:
+            _filter_right_spc_mask(right_spc_mask)
+        return right_spc_masks
+
+    def to_tensor(self, cfg: Namespace, rsc: Resource, do_spc_dropout: bool) \
+            -> Tuple[Tensor, Tensor, Tensor, Tensor]:
         """
         convert all characters and tags in the sentence into tensors that can be fed to the model's forward method.
         Args:
             cfg: config
             rsc: Resource object
-            is_train: whether is train or not
+            do_spc_dropout: whether to apply space dropout or not
         Returns:
             labels tensor
             contexts tensor
+            left space masks tensor
+            right space masks tensor
         """
         # dimension: [number of characters in the sentence, ]
-        label_nums = [rsc.vocab_out[l] for l in self.make_labels(False)]
+        label_nums = [rsc.vocab_out[tag] for pos_word in self.pos_tagged_words \
+                      for tag in pos_word.tags]
         labels_tensor = LongTensor(label_nums)
         # dimension: [number of characters in the sentence x context size]
-        spc_dropout = cfg.spc_dropout if is_train else 0.0
-        context_nums = [[rsc.vocab_in[c] for c in context] \
-                        for context in self.make_contexts(cfg.window, spc_dropout)]
+        contexts = self.make_contexts(cfg.window)
+        context_nums = [[rsc.vocab_in[c] for c in context] for context in contexts]
         contexts_tensor = LongTensor(context_nums)
-        return labels_tensor, contexts_tensor
+        spc_dropout = cfg.spc_dropout if do_spc_dropout else 0.0
+        left_spc_masks = self.make_left_spc_masks(cfg.window, rsc.vocab_in['<w>'], spc_dropout)
+        left_spc_masks_tensor = LongTensor(left_spc_masks)
+        right_spc_masks = self.make_right_spc_masks(cfg.window, rsc.vocab_in['</w>'], spc_dropout)
+        right_spc_masks_tensor = LongTensor(right_spc_masks)
 
-    def make_chars(self) -> List[str]:
-        """
-        make the characters contained in the sentence, including sentence and word boundaries.
-        Returns:
-            list of characters
-        """
-        chars = ['<s>', ]    # beginning of sentence
-        for word in self.words:
-            if len(chars) > 1:
-                chars.append('<w>')    # word boundary
-            chars.extend(word)
-        chars.append('</s>')    # end of sentence
-        return chars
+        return labels_tensor, contexts_tensor, left_spc_masks_tensor, right_spc_masks_tensor
 
 
 class PosDataset:
diff --git a/src/main/python/khaiii/train/embedder.py b/src/main/python/khaiii/train/embedder.py
index 61acf2d..b58cd7b 100644
--- a/src/main/python/khaiii/train/embedder.py
+++ b/src/main/python/khaiii/train/embedder.py
@@ -33,17 +33,22 @@ def __init__(self, cfg: Namespace, rsc: Resource):
         super().__init__()
         self.cfg = cfg
         self.rsc = rsc
-        self.embedding = nn.Embedding(len(rsc.vocab_in), cfg.embed_dim)
+        self.embedding = nn.Embedding(len(rsc.vocab_in), cfg.embed_dim, 0)
 
     def forward(self, inputs):    # pylint: disable=arguments-differ
         """
        method that generates the embedding
         Args:
-            inputs: contexts of batch size
+            inputs: batch size list of (context, left space mask, right space mask)
         Returns:
             embedding
         """
-        embeds = self.embedding(inputs)
+        contexts, left_spc_masks, right_spc_masks = inputs
+        embeds = self.embedding(contexts)
+        embeds += self.embedding(left_spc_masks)
+        embeds += self.embedding(right_spc_masks)
+        # the left/right padding positions get zero vectors, but the positional encoding below is still added to them
+        # minor, but shouldn't the padding positions also be masked before adding it?
         embeds += positional_encoding(self.cfg.context_len, self.cfg.context_len,
                                       self.cfg.embed_dim, 1)
         return embeds
 
diff --git a/src/main/python/khaiii/train/evaluator.py b/src/main/python/khaiii/train/evaluator.py
index 33f5fe3..3596039 100644
--- a/src/main/python/khaiii/train/evaluator.py
+++ b/src/main/python/khaiii/train/evaluator.py
@@ -38,9 +38,12 @@ def evaluate(self) -> Tuple[float, float, float]:
         """
         char_acc = self.cnt['match_chars'] / self.cnt['total_chars']
         word_acc = self.cnt['match_words'] / self.cnt['total_words']
-        recall = self.cnt['match_morphs'] / self.cnt['total_gold_morphs']
-        precision = self.cnt['match_morphs'] / self.cnt['total_pred_morphs']
-        f_score = 2.0 * recall * precision / (recall + precision)
+        if self.cnt['match_morphs'] == 0:
+            recall = precision = f_score = 0.0
+        else:
+            recall = self.cnt['match_morphs'] / self.cnt['total_gold_morphs']
+            precision = self.cnt['match_morphs'] / self.cnt['total_pred_morphs']
+            f_score = 2.0 * recall * precision / (recall + precision)
         self.cnt.clear()
         return char_acc, word_acc, f_score
 
@@ -104,13 +107,17 @@ def morphs_to_set(cls, morphs: List[PosMorph]) -> set:
 
     def report(self, fout: TextIO):
         """
         report recall/precision to file
-        :param fout: output file
+        Args:
+            fout: output file
         """
         print('word accuracy: %d / %d = %.4f' % (self.cnt['match_words'], self.cnt['total_words'],
                                                  self.cnt['match_words'] / self.cnt['total_words']), file=fout)
-        recall = self.cnt['match_morphs'] / self.cnt['total_gold_morphs']
-        precision = self.cnt['match_morphs'] / self.cnt['total_pred_morphs']
-        f_score = 2.0 * recall * precision / (recall + precision)
+        if self.cnt['match_morphs'] == 0:
+            recall = precision = f_score = 0.0
+        else:
+            recall = self.cnt['match_morphs'] / self.cnt['total_gold_morphs']
+            precision = self.cnt['match_morphs'] / self.cnt['total_pred_morphs']
+            f_score = 2.0 * recall * precision / (recall + precision)
         print('f-score / (recall, precision): %.4f / (%.4f, %.4f)' % (f_score, recall, precision),
               file=fout)
diff --git a/src/main/python/khaiii/train/models.py b/src/main/python/khaiii/train/models.py
index c1b3222..b4a9695 100644
--- a/src/main/python/khaiii/train/models.py
+++ b/src/main/python/khaiii/train/models.py
@@ -88,15 +88,15 @@ def __init__(self, cfg: Namespace, rsc: Resource):
         # hidden => tag
         self.hidden2tag = nn.Linear(cfg.hidden_dim, len(rsc.vocab_out))
 
-    def forward(self, contexts):    # pylint: disable=arguments-differ
+    def forward(self, inputs):    # pylint: disable=arguments-differ
         """
         forward path
         Args:
-            contexts: batch size list of character and context
+            inputs: batch size list of (context, left space mask, right space mask)
         Returns:
             output score
         """
-        embeds = self.embedder(contexts)
+        embeds = self.embedder(inputs)
         embeds_t = embeds.transpose(1, 2)
 
         pool_outs = []
@@ -105,7 +105,7 @@ def forward(self, contexts):    # pylint: disable=arguments-differ
             pool_outs.append(F.max_pool1d(conv_out, conv_out.size(2)))
 
         # conv => hidden
-        features = torch.cat([p.view(contexts.size(0), -1) for p in pool_outs], dim=1)    # pylint: disable=no-member
+        features = torch.cat([p.view(embeds.size(0), -1) for p in pool_outs], dim=1)    # pylint: disable=no-member
         features_drop = F.dropout(features)
         hidden_out = F.relu(self.conv2hidden(features_drop))
diff --git a/src/main/python/khaiii/train/tagger.py b/src/main/python/khaiii/train/tagger.py
index c0997d8..6e45a1c 100644
--- a/src/main/python/khaiii/train/tagger.py
+++ b/src/main/python/khaiii/train/tagger.py
@@ -54,10 +54,12 @@ def tag_raw(self, raw_sent: str, enable_restore: bool = True) -> PosSentTensor:
             PosSentTensor: PosSentTensor object
         """
         pos_sent = PosSentTensor(raw_sent)
-        _, contexts = pos_sent.to_tensor(self.cfg, self.rsc, False)
+        _, contexts, left_spc_masks, right_spc_masks = pos_sent.to_tensor(self.cfg, self.rsc, False)
         if torch.cuda.is_available():
             contexts = contexts.cuda()
-        outputs = self.model(contexts)
+            left_spc_masks = left_spc_masks.cuda()
+            right_spc_masks = right_spc_masks.cuda()
+        outputs = self.model((contexts, left_spc_masks, right_spc_masks))
         _, predicts = F.softmax(outputs, dim=1).max(1)
         tags = [self.rsc.vocab_out[t.item()] for t in predicts]
         pos_sent.set_pos_result(tags, self.rsc.restore_dic if enable_restore else None)
diff --git a/src/main/python/khaiii/train/trainer.py b/src/main/python/khaiii/train/trainer.py
index 6bc2f37..d95d0d5 100644
--- a/src/main/python/khaiii/train/trainer.py
+++ b/src/main/python/khaiii/train/trainer.py
@@ -152,7 +152,7 @@ def _restore_prev_train(self):
             if not line:
                 continue
             (epoch, loss_train, loss_dev, acc_char, acc_word, f_score, learning_rate) = \
-                line.split('\t')
+                    line.split('\t')
             self.cfg.epoch = int(epoch) + 1
             self.cfg.best_epoch = self.cfg.epoch
             self.loss_trains.append(float(loss_train))
@@ -184,7 +184,7 @@ def train(self):
         self.log_file = open('{}/log.tsv'.format(self.cfg.out_dir), 'at')
         self.sum_wrt = SummaryWriter(self.cfg.out_dir)
         patience = self.cfg.patience
-        for _ in range(1000000):
+        for _ in range(100000):
             is_best = self._train_epoch()
             if is_best:
                 patience = self.cfg.patience
@@ -226,13 +226,16 @@ def _train_epoch(self) -> bool:
         loss_trains = []
         for train_sent in tqdm(self.dataset_train, 'EPOCH[{}]'.format(self.cfg.epoch),
                                len(self.dataset_train), mininterval=1, ncols=100):
-            train_labels, train_contexts = train_sent.to_tensor(self.cfg, self.rsc, True)
+            train_labels, train_contexts, left_spc_masks, right_spc_masks = \
+                    train_sent.to_tensor(self.cfg, self.rsc, True)
             if torch.cuda.is_available():
                 train_labels = train_labels.cuda()
                 train_contexts = train_contexts.cuda()
+                left_spc_masks = left_spc_masks.cuda()
+                right_spc_masks = right_spc_masks.cuda()
             self.model.train()
-            train_outputs = self.model(train_contexts)
+            train_outputs = self.model((train_contexts, left_spc_masks, right_spc_masks))
             batches.append((train_labels, train_outputs))
             if sum([batch[0].size(0) for batch in batches]) < self.cfg.batch_size:
                 continue
@@ -347,11 +350,14 @@ def evaluate(self, is_dev: bool) -> Tuple[float, float, float, float]:
         losses = []
         for sent in dataset:
             # if spc_dropout is 1.0 or higher, spaces are never used, so apply it at evaluation time as well
-            labels, contexts = sent.to_tensor(self.cfg, self.rsc, self.cfg.spc_dropout >= 1.0)
+            labels, contexts, left_spc_masks, right_spc_masks = \
+                    sent.to_tensor(self.cfg, self.rsc, self.cfg.spc_dropout >= 1.0)
             if torch.cuda.is_available():
                 labels = labels.cuda()
                 contexts = contexts.cuda()
-            outputs = self.model(contexts)
+                left_spc_masks = left_spc_masks.cuda()
+                right_spc_masks = right_spc_masks.cuda()
+            outputs = self.model((contexts, left_spc_masks, right_spc_masks))
             loss = self.criterion(outputs, labels)
             losses.append(loss.item())
             _, predicts = F.softmax(outputs, dim=1).max(1)
@@ -360,4 +366,5 @@ def evaluate(self, is_dev: bool) -> Tuple[float, float, float, float]:
             pred_sent.set_pos_result(pred_tags, self.rsc.restore_dic)
             self.evaler.count(sent, pred_sent)
         avg_loss = sum(losses) / len(losses)
-        return (avg_loss, ) + self.evaler.evaluate()
+        char_acc, word_acc, f_score = self.evaler.evaluate()
+        return avg_loss, char_acc, word_acc, f_score
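
Note (not part of the patch): the index layout that the new Vocabulary docstring describes can be summarized with a small standalone sketch. The build_index helper and the sample entries below are hypothetical and only mirror the reconstructed constructor above; they are not khaiii APIs.

from typing import Dict, List

def build_index(entries: List[str], unk: str = '', special: List[str] = None) -> Dict[str, int]:
    """Toy mirror of Vocabulary.__init__: padding, then unknown, then specials, then file entries."""
    rev = ['', unk] if unk else ['', ]      # '' is the padding entry and always gets index 0
    if special:
        rev.extend(special)                 # e.g. '<w>', '</w>' for the input vocabulary
    rev.extend(entries)                     # entries that would be loaded from vocab.in / vocab.out
    return {entry: num for num, entry in enumerate(rev)}

dic = build_index(['가', '나'], unk='@@UNKNOWN@@', special=['<w>', '</w>'])
assert dic[''] == 0                         # padding index is always 0
assert dic['@@UNKNOWN@@'] == 1              # unknown index is always 1 when an unk entry is given
assert dic['<w>'] == 2 and dic['</w>'] == 3

Because the pad positions in the context tensor and the zero positions in the space-mask tensors all map to index 0, nn.Embedding(len(rsc.vocab_in), cfg.embed_dim, 0) in the embedder sends them to the padding row, where they contribute a zero vector.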
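The left/right space-mask construction in dataset.py is the core of this change, so here is a self-contained toy version of the left-hand side for illustration. make_left_spc_masks_toy, the sample words, and the id 2 standing in for '<w>' are made up for this example; the logic follows the reconstructed make_left_spc_masks above.

import itertools
import random

def make_left_spc_masks_toy(words, window, left_vocab_id, spc_dropout=0.0):
    # mark the first character of every word with the '<w>' vocabulary id
    left_spcs = list(itertools.chain.from_iterable(
        [left_vocab_id] + [0] * (len(word) - 1) for word in words))
    left_padded = [0] * window + left_spcs + [0] * window
    # one mask per character: the left window plus the center slot, right half all zeros
    masks = [left_padded[idx - window:idx + 1] + [0] * window
             for idx in range(window, len(left_spcs) + window)]
    for mask in masks:
        # keep only the boundary nearest to the center (optionally dropped out), clear the rest
        for idx in range(window, -1, -1):
            if mask[idx] == left_vocab_id:
                if random.random() < spc_dropout:
                    mask[idx] = 0
                for jdx in range(idx - 1, -1, -1):
                    mask[jdx] = 0
                break
    return masks

print(make_left_spc_masks_toy(['ab', 'cde'], window=3, left_vocab_id=2))
# [[0, 0, 0, 2, 0, 0, 0],   'a': word start at the center position
#  [0, 0, 2, 0, 0, 0, 0],   'b': word start one slot to the left
#  [0, 0, 0, 2, 0, 0, 0],   'c': only the nearest boundary is kept
#  [0, 0, 2, 0, 0, 0, 0],   'd'
#  [0, 2, 0, 0, 0, 0, 0]]   'e'

With spc_dropout=0.0 each row carries at most one non-zero value: the vocabulary id of the nearest word-start boundary within the left window, aligned with the corresponding context row. The embedder adds embedding(mask) on top of embedding(context), so each character's context is augmented with a single '<w>' vector (or nothing, when the boundary is dropped out or out of range); make_right_spc_masks mirrors this with '</w>' on the right half.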