Sentiment analysis pytorch #1

Open · wants to merge 11 commits into base: master
179 changes: 179 additions & 0 deletions sentiment_analysis/pytorch/train.py
@@ -0,0 +1,179 @@
import argparse
import warnings
from torchtext import data
from torchtext import datasets
import torch
import tqdm
import torch.nn as nn
from torch.autograd import Variable
import torch.nn.functional as F
import matplotlib.pyplot as plt
import numpy as np


def IMDB_dataset(use_cuda=True, batch_size=128, max_len=2470):
"""
PyTorch generator for the IMDB dataset.
Args:
use_cuda - bool
batch_size - int
max_len - int - max length of the sentence in train.
All smaller sentences will be padded to have length = max_len.
All larger sentences will be cropped.
Returns:
train_iter, test_iter - batch generators
len(TEXT.vocab) - vocabulary size. Necessary for the embedding layer
batch_size
"""
device = "cuda:0" if use_cuda else -1


Did you test it on GPU?

# set up fields
TEXT = data.Field(lower=True, fix_length=max_len, batch_first=True)
LABEL = data.Field(sequential=False)
# make splits for data
train, test = datasets.IMDB.splits(TEXT, LABEL)
# build the vocabulary
TEXT.build_vocab(train)
LABEL.build_vocab(train)
train_iter, test_iter = data.BucketIterator.splits(
(train, test), batch_size=batch_size, device=device)
return train_iter, test_iter, len(TEXT.vocab), batch_size
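
As a quick illustration (not part of the diff), the iterators returned here yield batches whose .text field is a (batch_size, max_len) tensor of padded token ids; the label vocabulary built by torchtext typically reserves index 0 for '&lt;unk&gt;', so the two classes come out as 1 and 2, which is why the training loop below subtracts 1:

# Hypothetical usage sketch for IMDB_dataset (assumes the IMDB data has been downloaded).
train_iter, test_iter, vocab_size, batch_size = IMDB_dataset(use_cuda=False, batch_size=32)
batch = next(iter(train_iter))
print(batch.text.shape)   # torch.Size([32, 2470]): padded/cropped token ids
print(batch.label[:5])    # class indices, expected to be in {1, 2}
print(vocab_size)         # size of TEXT.vocab, fed to the embedding layer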


class ConvNet(nn.Module):
"""
Architecture:
Embedding layer with customizable vocab size and
embedding size,
2d Convolutional layer,
2d Max Pooling layer,
Fully-connected layer
"""
    def __init__(self, vocab_size, embedding_size=1024):
        super().__init__()
        self.vocab_size = vocab_size
        self.embedding_size = embedding_size
        # Create the embedding here so its weights are registered as model parameters
        # and trained; building it inside forward() would re-initialize it on every
        # call and keep it out of model.parameters(). sparse gradients are avoided
        # because RMSprop does not support them.
        self.embedding = nn.Embedding(vocab_size, embedding_size)
        self.features = nn.Sequential(
            nn.Conv2d(1, 1, kernel_size=3),
            nn.ELU(),
            nn.MaxPool2d(kernel_size=3),
        )
        # Flattened feature-map size for the default max_len=2470 and embedding_size=1024.
        self.classifier = nn.Linear(279480, 2)

    def forward(self, sentences):
        sentences_embedded = self.embedding(sentences)
        sentences_embedded = sentences_embedded.unsqueeze(1)  # add a channel dimension
        out = self.features(sentences_embedded)
        out = out.view(out.size(0), -1)
        out = self.classifier(out)
        return F.log_softmax(out, dim=-1)
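
As a side note (not part of the PR), the hard-coded 279480 can be sanity-checked: with max_len=2470 and embedding_size=1024, Conv2d(kernel_size=3) produces a 2468 x 1022 map, MaxPool2d(kernel_size=3) reduces it to 822 x 340, and 822 * 340 = 279480. A hypothetical dummy forward pass confirms the shapes:

# Shape check with a tiny vocabulary; only the sequence length and embedding size matter here.
model = ConvNet(vocab_size=100)
dummy = torch.randint(0, 100, (2, 2470))  # batch of 2 token-id sequences of length max_len
out = model(dummy)
print(out.shape)  # torch.Size([2, 2]): log-probabilities over the two sentiment classes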


def train(model, optimizer, n_epochs, train_iter, test_iter, vocab_size, batch_size, quality):
"""
Performs training with n_epochs steps.
"""
train_log, train_acc_log = [], []
val_log, val_acc_log = [], []

for epoch in range(n_epochs):
train_loss, train_acc = train_epoch(model, optimizer, train_iter)

val_loss, val_acc = test(model, test_iter, quality)

train_log.extend(train_loss)
train_acc_log.extend(train_acc)

steps = 25000 / batch_size
val_log.append((steps * (epoch + 1), np.mean(val_loss)))
val_acc_log.append((steps * (epoch + 1), np.mean(val_acc)))

plot_history(train_log, val_log)
Plotting will be a performance hit to the training; I don't think it is desirable in a benchmark... correct me if I'm wrong.
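
One possible way to address this (a sketch, not something the PR implements) would be to gate plotting behind an opt-in flag so benchmark runs skip it:

# Hypothetical helper: only plot when explicitly requested.
def maybe_plot(train_log, val_log, train_acc_log, val_acc_log, plot=False):
    if not plot:
        return
    plot_history(train_log, val_log)
    plot_history(train_acc_log, val_acc_log, title='accuracy')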

plot_history(train_acc_log, val_acc_log, title='accuracy')
print("Final error: {:.2%}".format(1 - val_acc_log[-1][1]))

def train_epoch(model, optimizer, train_iter):
loss_log, acc_log = [], []
model.train()
for batch in tqdm.tqdm(train_iter):


Just tqdm(train_iter)?

        data = Variable(batch.text)
        target = Variable(batch.label) - 1  # torchtext labels are 1/2; shift to 0/1
        optimizer.zero_grad()
        output = model(data)
        pred = torch.max(output, 1)[1].data.cpu().numpy()  # .cpu() so this also works on CUDA tensors
        acc = np.mean(pred == target.data.cpu().numpy())
        acc_log.append(acc)
        loss = F.nll_loss(output, target)
        loss.backward()
        optimizer.step()
        loss_log.append(loss.item())  # scalar loss value
return loss_log, acc_log

def test(model, test_iter, quality):
loss_log, acc_log = [], []
model.eval()
for batch in tqdm.tqdm(test_iter):
        data = Variable(batch.text)
        target = Variable(batch.label) - 1
        output = model(data)
        loss = F.nll_loss(output, target)
        pred = torch.max(output, 1)[1].data.cpu().numpy()
        acc = np.mean(pred == target.data.cpu().numpy())
        #if acc >= quality:
        #    break
        acc_log.append(acc)
        loss_log.append(loss.item())
return loss_log, acc_log

def plot_history(train_history, val_history, title='loss'):

Do we need this?

plt.figure()
plt.title('{}'.format(title))
plt.plot(train_history, label='train', zorder=1)
points = np.array(val_history)
plt.scatter(points[:, 0], points[:, 1], marker='+', s=180, c='orange', label='val', zorder=2)
plt.xlabel('train steps')
plt.legend(loc='best')
plt.grid()
plt.show()



def main(use_cuda, seed, quality):
    if use_cuda and not torch.cuda.is_available():
        warnings.warn("CUDA device is not accessible! Setting use_cuda to False.")
        use_cuda = False

    torch.manual_seed(seed)  # seed the RNG for reproducibility

    train_iter, test_iter, vocab_size, batch_size = IMDB_dataset(use_cuda)

    model = ConvNet(vocab_size)
    if use_cuda:
        model = model.cuda()  # keep the model on the same device as the batches from the iterators
    optimizer = torch.optim.RMSprop(model.parameters(), lr=0.001)
    n_epochs = 3

    train(model, optimizer, n_epochs, train_iter, test_iter, vocab_size, batch_size, quality)



if __name__ == '__main__':

parser = argparse.ArgumentParser()

# Parse arguments
parser.add_argument('-m', '--model', nargs='?', required=False,
choices=['conv', 'lstm'], default='conv',
help="Model type for sentiment analysis")
parser.add_argument('-q', '--target_quality', type=float, required=False,
default=90.6,
help="Target validation quality to stop training")
parser.add_argument('-s', '--seed', type=int, required=False, default=1,
help="Seed for random number generator")

args = parser.parse_args()

    print(args)

main(use_cuda=True, # Runs on CPU if "False"
seed=args.seed,
quality=args.target_quality)
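
For reference, a typical invocation of this script (given the argparse options above) would be python sentiment_analysis/pytorch/train.py -m conv -q 90.6 -s 1; note that the --model choice is parsed but main() currently always builds ConvNet, and use_cuda is hard-coded to True.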