Sentiment analysis pytorch #1
base: master
import argparse
import warnings
from torchtext import data
from torchtext import datasets
import torch
import tqdm
import torch.nn as nn
from torch.autograd import Variable
import torch.nn.functional as F
import matplotlib.pyplot as plt
import numpy as np

def IMDB_dataset(use_cuda=True, batch_size=128, max_len=2470):
    """
    PyTorch generator for the IMDB dataset.
    Args:
        use_cuda - bool
        batch_size - int
        max_len - int - max length of a sentence in train.
            All shorter sentences will be padded to length = max_len.
            All longer sentences will be cropped.
    Returns:
        train_iter, test_iter - batch generators
        len(TEXT.vocab) - vocabulary size, needed for the embedding layer
        batch_size
    """
    device = "cuda:0" if use_cuda else -1
    # set up fields
    TEXT = data.Field(lower=True, fix_length=max_len, batch_first=True)
    LABEL = data.Field(sequential=False)
    # make splits for data
    train, test = datasets.IMDB.splits(TEXT, LABEL)
    # build the vocabulary
    TEXT.build_vocab(train)
    LABEL.build_vocab(train)
    train_iter, test_iter = data.BucketIterator.splits(
        (train, test), batch_size=batch_size, device=device)
    return train_iter, test_iter, len(TEXT.vocab), batch_size

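A minimal usage sketch (illustrative only, not part of the file under review) of what the returned iterators yield; the shapes follow from batch_first=True and fix_length=max_len:

    train_iter, test_iter, vocab_size, batch_size = IMDB_dataset(use_cuda=False)
    batch = next(iter(train_iter))
    print(batch.text.shape)    # (batch_size, max_len), e.g. (128, 2470)
    print(batch.label.shape)   # (batch_size,); labels are vocab indices 1/2, hence the "- 1" in train_epoch
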
class ConvNet(nn.Module):
    """
    Architecture:
        Embedding layer with customizable vocab size and
            embedding size,
        2d Convolutional layer,
        2d Max Pooling layer,
        Fully-connected layer
    """
    def __init__(self, vocab_size, embedding_size=1024):
        super().__init__()
        self.vocab_size = vocab_size
        self.embedding_size = embedding_size
        # The embedding is created once here so its weights are registered as
        # model parameters and updated by the optimizer; creating it inside
        # forward() would re-initialize it on every call and it would never learn.
        self.embedding = nn.Embedding(vocab_size, embedding_size)
        self.features = nn.Sequential(
            nn.Conv2d(1, 1, kernel_size=3),
            nn.ELU(),
            nn.MaxPool2d(kernel_size=3),
        )
        # 279480 = 822 * 340: with max_len=2470 and embedding_size=1024 the
        # convolution (kernel 3, no padding) gives 2468 x 1022, and the
        # max pooling (kernel 3) reduces it to 822 x 340.
        self.classifier = nn.Linear(279480, 2)

    def forward(self, sentences):
        sentences_embedded = self.embedding(sentences)
        # add a channel dimension for the 2d convolution:
        # (batch, max_len, embedding_size) -> (batch, 1, max_len, embedding_size)
        sentences_embedded = sentences_embedded.unsqueeze(1)
        out = self.features(sentences_embedded)
        out = out.view(out.size(0), -1)
        out = self.classifier(out)
        return F.log_softmax(out, dim=-1)

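A quick shape-check sketch (illustrative only, not part of the diff) confirming where the 279480 classifier input features come from; the vocabulary size of 1000 below is an arbitrary placeholder, not the real IMDB vocabulary:

    model = ConvNet(vocab_size=1000)           # placeholder vocab size for illustration
    dummy = torch.randint(0, 1000, (2, 2470))  # batch of 2 padded sentences, max_len=2470
    print(model(dummy).shape)                  # torch.Size([2, 2]): log-probabilities over the two classes
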
def train(model, optimizer, n_epochs, train_iter, test_iter, vocab_size, batch_size, quality):
    """
    Performs training with n_epochs steps.
    """
    train_log, train_acc_log = [], []
    val_log, val_acc_log = [], []

    for epoch in range(n_epochs):
        train_loss, train_acc = train_epoch(model, optimizer, train_iter)

        val_loss, val_acc = test(model, test_iter, quality)

        train_log.extend(train_loss)
        train_acc_log.extend(train_acc)

        # the IMDB train split has 25000 reviews, so this is the number of
        # optimizer steps per epoch
        steps = 25000 / batch_size
        val_log.append((steps * (epoch + 1), np.mean(val_loss)))
        val_acc_log.append((steps * (epoch + 1), np.mean(val_acc)))

    plot_history(train_log, val_log)

Review comment: plotting will be a performance hit to the training, i don't think it is desirable in benchmark... correct me if i'm wrong.

    plot_history(train_acc_log, val_acc_log, title='accuracy')
    print("Final error: {:.2%}".format(1 - val_acc_log[-1][1]))

def train_epoch(model, optimizer, train_iter):
    loss_log, acc_log = [], []
    model.train()
    for batch in tqdm.tqdm(train_iter):

Review comment: just

        data = Variable(batch.text)
        target = Variable(batch.label) - 1
        optimizer.zero_grad()
        output = model(data)
        pred = torch.max(output, 1)[1].data.numpy()
        acc = np.mean(pred == target.data.numpy())
        acc_log.append(acc)
        loss = F.nll_loss(output, target)
        loss.backward()
        optimizer.step()
        loss_log.append(loss.item())
    return loss_log, acc_log

def test(model, test_iter, quality):
    loss_log, acc_log = [], []
    model.eval()
    for batch in tqdm.tqdm(test_iter):
        data = Variable(batch.text)
        target = Variable(batch.label) - 1
        output = model(data)
        loss = F.nll_loss(output, target)
        pred = torch.max(output, 1)[1].data.numpy()
        acc = np.mean(pred == target.data.numpy())
        #if acc >= quality:
        #    break
        acc_log.append(acc)
        loss_log.append(loss.item())
    return loss_log, acc_log

def plot_history(train_history, val_history, title='loss'):

Review comment: do we need this?

    plt.figure()
    plt.title('{}'.format(title))
    plt.plot(train_history, label='train', zorder=1)
    points = np.array(val_history)
    plt.scatter(points[:, 0], points[:, 1], marker='+', s=180, c='orange', label='val', zorder=2)
    plt.xlabel('train steps')
    plt.legend(loc='best')
    plt.grid()
    plt.show()

def main(use_cuda, seed, quality):
    if use_cuda and not torch.cuda.is_available():
        warnings.warn("CUDA device is not accessible! Setting use_cuda to False.")
        use_cuda = False

    # seed the random number generators so runs are reproducible
    torch.manual_seed(seed)
    np.random.seed(seed)

    train_iter, test_iter, vocab_size, batch_size = IMDB_dataset(use_cuda)

    model = ConvNet(vocab_size)
    optimizer = torch.optim.RMSprop(model.parameters(), lr=0.001)
    n_epochs = 3

    train(model, optimizer, n_epochs, train_iter, test_iter, vocab_size, batch_size, quality)

if __name__ == '__main__':

    parser = argparse.ArgumentParser()

    # Parse arguments
    parser.add_argument('-m', '--model', nargs='?', required=False,
                        choices=['conv', 'lstm'], default='conv',
                        help="Model type for sentiment analysis")
    parser.add_argument('-q', '--target_quality', type=float, required=False,
                        default=90.6,
                        help="Target validation quality to stop training")
    parser.add_argument('-s', '--seed', type=int, required=False, default=1,
                        help="Seed for random number generator")

    args = parser.parse_args()

    print(args)

    main(use_cuda=True,  # Runs on CPU if "False"
         seed=args.seed,
         quality=args.target_quality)
Review comment: did you test it on gpu?
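On the GPU question above: as written, the model is never moved to the GPU, and .data.numpy() is called on tensors that would live on the GPU when use_cuda=True, so a CUDA run would fail. A minimal sketch of the changes this would need (illustrative only, not part of the PR; it assumes the surrounding variables from the file):

    device = torch.device("cuda:0" if use_cuda else "cpu")
    model = ConvNet(vocab_size).to(device)           # move parameters to the GPU

    # inside the training/evaluation loops:
    data = batch.text.to(device)
    target = batch.label.to(device) - 1
    output = model(data)
    pred = torch.max(output, 1)[1].cpu().numpy()     # copy back to host before numpy
    acc = np.mean(pred == target.cpu().numpy())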