inference.py
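"""Inference script: load a trained LSTM/BiLSTM language model and print each
example's predicted sentence next to its actual target sentence."""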
import argparse
import pickle

import torch
from torch.utils.data import DataLoader

from tokenization import Tokenizer, Vocab
from dataset_utils import Corpus
from models import LSTMLM, BiLSTMLM


def argparser():
    p = argparse.ArgumentParser()

    # Required parameters
    p.add_argument('--corpus', default=None, type=str, required=True)
    p.add_argument('--vocab', default=None, type=str, required=True)
    p.add_argument('--model', default=None, type=str, required=True)
    p.add_argument('--model_type', default=None, type=str, required=True,
                   help='Model type selected in the list: LSTM, BiLSTM')

    # Input parameters
    p.add_argument('--is_tokenized', action='store_true',
                   help='Whether the corpus is already tokenized')
    p.add_argument('--tokenizer', default='mecab', type=str,
                   help='Tokenizer used for input corpus tokenization')
    p.add_argument('--max_seq_len', default=32, type=int,
                   help='The maximum total input sequence length after tokenization')

    # Inference parameters
    p.add_argument('--multi_gpu', action='store_true',
                   help='Whether to run inference on multiple GPUs')
    p.add_argument('--cuda', default=True, type=bool,
                   help='Whether CUDA is currently available')
    p.add_argument('--batch_size', default=16, type=int,
                   help='Batch size for inference')

    # Model parameters
    p.add_argument('--embedding_size', default=256, type=int,
                   help='Word embedding vector dimension')
    p.add_argument('--hidden_size', default=1024, type=int,
                   help='Hidden size of LSTM')
    p.add_argument('--n_layers', default=3, type=int,
                   help='Number of layers in LSTM')
    p.add_argument('--dropout_p', default=.2, type=float,
                   help='Dropout rate used for dropout layer in LSTM')

    config = p.parse_args()
    return config
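

# Example invocation (the file paths below are illustrative placeholders, not files
# shipped with this repository):
#   python inference.py --corpus data/test.txt --vocab vocab.pkl \
#       --model lstm_lm.pth --model_type LSTM --batch_size 16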


def sentence_from_indexes(indexes):
    # |indexes| = (max_seq_len-1)
    # Convert indexes to tokens
    tokens = tokenizer.inverse_transform(indexes.tolist())
    # |tokens| = (max_seq_len-1)
    try:
        # Return tokens up to eos_token
        first_eos_token_index = tokens.index(vocab.eos_token)
        return ' '.join(tokens[:first_eos_token_index])
    except ValueError:
        # Only if eos_token is not in the token list
        return ' '.join(tokens)
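
# For example, if the decoded tokens were ['I', 'ate', 'lunch', '<eos>', '<pad>'] and
# vocab.eos_token were '<eos>' (illustrative values only), this would return 'I ate lunch'.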


def inference():
    model.eval()
    with torch.no_grad():
        for iter_, batch in enumerate(loader):
            inputs, targets = batch
            # |inputs|, |targets| = (batch_size, seq_len), (batch_size, seq_len)

            preds = model(inputs)
            # |preds| = (batch_size, max_seq_len-1, len(vocab))

            if config.multi_gpu:
                # If the model runs in parallel via DataParallelModel, the output is a
                # list of per-GPU tensors:
                # |preds| = [(batch_size/n_gpus, max_seq_len-1, len(vocab))] * n_gpus
                # so concatenate the tensors split across GPUs.
                preds = torch.cat([pred for pred in preds], dim=0)
                # |preds| = (batch_size, max_seq_len-1, len(vocab))

            # Take the highest-scoring prediction at each position
            topv, topi = torch.topk(preds, 1)
            # |topv|, |topi| = (batch_size, max_seq_len-1, 1)

            # Convert indexes to sentences
            for i, (each_topi, target) in enumerate(zip(topi, targets)):
                target_sentences = sentence_from_indexes(target)
                pred_sentences = sentence_from_indexes(each_topi.squeeze(-1))
                print('#{} =============='.format(iter_ * config.batch_size + i))
                print('Actu:\t{}\nPred:\t{}\n'.format(target_sentences, pred_sentences))
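
# Note: predictions are the per-position argmax over the vocabulary, conditioned on the
# ground-truth inputs from the loader (teacher forcing), not free-running autoregressive
# generation.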


if __name__ == '__main__':
    config = argparser()
    print(config)

    # Load vocabulary
    with open(config.vocab, 'rb') as reader:
        vocab = pickle.load(reader)

    # Select tokenizer
    if config.tokenizer == 'mecab':
        from konlpy.tag import Mecab
        tokenizer = Tokenizer(tokenization_fn=Mecab().morphs,
                              vocab=vocab, max_seq_length=config.max_seq_len)

    # Build dataloader
    corpus = Corpus(corpus_path=config.corpus, tokenizer=tokenizer,
                    model_type=config.model_type, cuda=config.cuda)
    loader = DataLoader(dataset=corpus, batch_size=config.batch_size)

    # Load model with trained parameters
    if config.model_type == 'LSTM':
        model = LSTMLM(input_size=len(vocab),
                       embedding_size=config.embedding_size,
                       hidden_size=config.hidden_size,
                       output_size=len(vocab),
                       n_layers=config.n_layers,
                       dropout_p=config.dropout_p)
    elif config.model_type == 'BiLSTM':
        model = BiLSTMLM(input_size=len(vocab),
                         embedding_size=config.embedding_size,
                         hidden_size=config.hidden_size,
                         output_size=len(vocab),
                         n_layers=config.n_layers,
                         dropout_p=config.dropout_p)
    else:
        raise ValueError('Unsupported model_type: {}'.format(config.model_type))

    if config.cuda:
        if config.multi_gpu:
            from parallel import DataParallelModel
            model = DataParallelModel(model).cuda()
        else:
            model = model.cuda()
    model.load_state_dict(torch.load(config.model))
    print('=========MODEL=========\n', model)

    # Inference
    inference()
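
# Expected console output per example (format only; contents depend on the corpus and model):
#   #0 ==============
#   Actu:    <target sentence>
#   Pred:    <predicted sentence>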