diff --git a/sentiment_analysis/pytorch/README.md b/sentiment_analysis/pytorch/README.md
new file mode 100644
index 000000000..53cdb867c
--- /dev/null
+++ b/sentiment_analysis/pytorch/README.md
@@ -0,0 +1,108 @@
+# Architecture
+
+The architecture mimics the original one implemented in PaddlePaddle:
+https://github.com/mlperf/reference/blob/master/sentiment_analysis/paddle/train.py
+
+## Brief overview:
+1. Embedding layer
+2. Two independent conv + relu + max pool layers with 3x3 and 4x4 filters
+3. Fully-connected + softmax
+
+### 1. Embedding layer
+
+The embedding layer is an nn.Embedding layer built from
+self.vocab_size and self.embedding_size.
+self.vocab_size is computed by the IMDB_dataset() function.
+Initial setup:
+self.embedding_size = 1024
+
+### 2. Two independent conv + relu + max pool layers with 3x3 and 4x4 filters
+
+Note that the implementation uses nn.ELU as the activation.
+
+### 3. Fully-connected layer takes two concatenated vectors (outputs of conv3x3 and conv4x4)
+
+We apply log-softmax to the output of the fully-connected layer and
+get an array of size 2: the probabilities of the positive and negative classes.
+
+# Dataset loader
+
+Implemented in the IMDB_dataset() function - a PyTorch batch generator for the IMDB dataset.
+
+# Installation guide:
+
+## 1. Install conda:
+
+You have to install conda;
+pip does not currently work correctly with torch on Linux.
+
+```
+wget "https://repo.anaconda.com/archive/Anaconda3-5.2.0-Linux-x86_64.sh"
+bash Anaconda3-5.2.0-Linux-x86_64.sh
+```
+
+## 2. Create and activate a virtual environment using conda:
+```
+conda create -n torchenv_conda python=3.6
+source activate torchenv_conda
+```
+
+## 3. Install torch, torchvision and torchtext
+
+Without CUDA (CPU only):
+```
+conda install pytorch-cpu torchvision-cpu -c pytorch
+pip install cython
+pip install msgpack
+pip install torchtext
+```
+With CUDA 9.0:
+```
+conda create -n torchenv_conda_gpu_cuda_9 python=3.6
+source activate torchenv_conda_gpu_cuda_9
+conda install pytorch torchvision cuda90 -c pytorch
+pip install cython
+pip install msgpack
+pip install torchtext
+```
+
+If you need to install PyTorch for another version of CUDA, see the
+"Get started" section on the main page of https://pytorch.org/
+
+## 4. Run the script:
+
+Reminder: the virtual environment you created has to be activated.
+
+Run:
+```
+python train.py
+```
+It will work fine on CPU.
+
+When trying to run the script on GPU, you will get an error "long tensor expected but got cuda.longTensor".
+
+It happens because the nn.Embedding layer in PyTorch expects a plain LongTensor of indices rather than the cuda.LongTensor produced by the CUDA data iterator:
+https://pytorch.org/docs/master/nn.html#embedding
+"Input: LongTensor of arbitrary shape containing the indices to extract"
+
+Similar problem:
+https://github.com/pytorch/pytorch/issues/7236
+
+So you can simply disable CUDA in the main function (set the use_cuda flag to False).
+
+Ways to work around the problem:
+1) Write your own embedding layer (a minimal sketch is shown below)
+2) Get rid of the embedding layer
+
+Getting rid of the embedding layer is a questionable move because our goal is to mimic the PaddlePaddle implementation already present in the repo.
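+
+For reference, a hand-rolled embedding layer is just a lookup into a trainable
+(vocab_size, embedding_size) weight matrix. A minimal sketch of option 1 (the
+class name ManualEmbedding is illustrative and not part of this repo):
+
+```
+import torch
+import torch.nn as nn
+
+
+class ManualEmbedding(nn.Module):
+    """Lookup of word indices in a trainable (vocab_size, embedding_size) matrix."""
+
+    def __init__(self, vocab_size, embedding_size):
+        super().__init__()
+        self.weight = nn.Parameter(torch.randn(vocab_size, embedding_size) * 0.01)
+
+    def forward(self, indices):
+        # indices: LongTensor of shape (batch, seq_len); advanced indexing
+        # returns (batch, seq_len, embedding_size) and backpropagates into
+        # self.weight
+        return self.weight[indices]
+```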
+We could also ask to change the configuration of both the PyTorch and PaddlePaddle implementations to be closer to the paper they are supposed to follow:
+https://arxiv.org/abs/1412.1058
+
+The full explanation can be found in the official repo: https://github.com/mlperf/reference/tree/master/sentiment_analysis
diff --git a/sentiment_analysis/pytorch/train.py b/sentiment_analysis/pytorch/train.py
new file mode 100644
index 000000000..2596da3bd
--- /dev/null
+++ b/sentiment_analysis/pytorch/train.py
@@ -0,0 +1,209 @@
+import argparse
+import warnings
+from torchtext import data
+from torchtext import datasets
+import torch
+import tqdm
+import torch.nn as nn
+from torch.autograd import Variable
+import torch.nn.functional as F
+import numpy as np
+
+
+def IMDB_dataset(use_cuda=True, batch_size=128, max_len=2470):
+    """
+    PyTorch batch generator for the IMDB dataset.
+    Args:
+        use_cuda - bool
+        batch_size - int
+        max_len - int - max length of a sentence in train.
+            All shorter sentences will be padded to length = max_len.
+            All longer sentences will be cropped.
+    Returns:
+        train_iter, test_iter - batch generators
+        len(TEXT.vocab) - vocabulary size, needed for the embedding layer
+        batch_size
+    """
+    device = "cuda:0" if use_cuda else -1
+    # set up fields
+    TEXT = data.Field(lower=True, fix_length=max_len, batch_first=True)
+    LABEL = data.Field(sequential=False)
+    # make splits for data
+    train, test = datasets.IMDB.splits(TEXT, LABEL)
+    # build the vocabulary
+    TEXT.build_vocab(train)
+    LABEL.build_vocab(train)
+    train_iter, test_iter = data.BucketIterator.splits(
+        (train, test), batch_size=batch_size, device=device)
+    return train_iter, test_iter, len(TEXT.vocab), batch_size
+
+
+class ConvNet(nn.Module):
+    """
+    Architecture:
+        Embedding layer with customizable vocab size and
+        embedding size.
+        For filter sizes 3 and 4:
+            2d convolutional layer,
+            2d max pooling layer.
+        Fully-connected layer applied to the
+        concatenated outputs of the two separate convolutional
+        layers.
+    """
+
+    def __init__(self, vocab_size, embedding_size=1024):
+        super().__init__()
+        self.vocab_size = vocab_size
+        self.embedding_size = embedding_size
+        self.conv_3 = nn.Sequential(
+            nn.Conv2d(1, 1, kernel_size=3),
+            nn.ELU(),
+            nn.MaxPool2d(kernel_size=3),
+        )
+        self.conv_4 = nn.Sequential(
+            nn.Conv2d(1, 1, kernel_size=4),
+            nn.ELU(),
+            nn.MaxPool2d(kernel_size=4),
+        )
+        self.fc = nn.Linear(436560, 2)
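+
+        # Where 436560 comes from (an illustrative walk-through, assuming the
+        # defaults max_len=2470 and embedding_size=1024 used in this file;
+        # none of these numbers are checked at runtime):
+        #   embedded input:   (batch, 1, 2470, 1024)
+        #   conv_3 + pool(3): (batch, 1, 822, 340) -> 822 * 340 = 279480
+        #   conv_4 + pool(4): (batch, 1, 616, 255) -> 616 * 255 = 157080
+        #   concatenated:     279480 + 157080 = 436560 features into self.fc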
+
+    def forward(self, sentences):
+        # The embedding layer is built anew on every forward pass, so its
+        # weights stay on the CPU and are not registered with the optimizer
+        # (see the CUDA note in the README).
+        embed_layer = nn.Embedding(
+            self.vocab_size,
+            self.embedding_size,
+            sparse=True)
+        sentences_embedded = embed_layer(sentences)
+        sentences_embedded = sentences_embedded.unsqueeze(1)
+        out_3 = self.conv_3(sentences_embedded)
+        out_4 = self.conv_4(sentences_embedded)
+        out_3 = out_3.view(out_3.size(0), -1)
+        out_4 = out_4.view(out_4.size(0), -1)
+        out = torch.cat((out_3, out_4), 1)
+        out = self.fc(out)
+        return F.log_softmax(out, dim=-1)
+
+
+def train(
+        model,
+        optimizer,
+        n_epochs,
+        train_iter,
+        test_iter,
+        vocab_size,
+        batch_size,
+        quality,
+        train_size):
+    """
+    Runs training for n_epochs epochs.
+    Args:
+        train_iter - torch iterator over the train set
+        test_iter - torch iterator over the test set
+        vocab_size - int, number of unique words to be embedded
+        batch_size - int, size of a minibatch
+        quality - float, accuracy in percent to reach on the test set
+        train_size - int, number of sentences in the train set
+    """
+    train_log, train_acc_log = [], []
+    val_log, val_acc_log = [], []
+    for epoch in range(n_epochs):
+        train_loss, train_acc = train_epoch(model, optimizer, train_iter)
+        val_loss, val_acc = test(model, test_iter, quality)
+        train_log.extend(train_loss)
+        train_acc_log.extend(train_acc)
+        steps = train_size // batch_size
+        val_log.append((steps * (epoch + 1), np.mean(val_loss)))
+        average_val_acc = np.mean(val_acc)
+        val_acc_log.append((steps * (epoch + 1), average_val_acc))
+        print("Epoch =", epoch,
+              ", train-accuracy =", np.mean(train_acc), ", train-loss =",
+              np.mean(train_loss), ", validation-accuracy =", average_val_acc,
+              ", validation-loss =", np.mean(val_loss))
+        # quality is given in percent (e.g. 90.6), accuracies are fractions
+        if average_val_acc * 100 > quality:
+            break
+
+    print("Final error: {:.2%}".format(1 - val_acc_log[-1][1]))
+
+
+def train_epoch(model, optimizer, train_iter):
+    loss_log, acc_log = [], []
+    model.train()
+    for batch in tqdm.tqdm(train_iter):
+        data = Variable(batch.text)
+        # labels from the iterator are 1 and 2; shift them to 0 and 1
+        target = Variable(batch.label) - 1
+        optimizer.zero_grad()
+        output = model(data)
+        pred = torch.max(output, 1)[1].data.numpy()
+        acc = np.mean(pred == target.data.numpy())
+        acc_log.append(acc)
+        loss = F.nll_loss(output, target)
+        loss.backward()
+        optimizer.step()
+        loss_log.append(loss.item())
+    return loss_log, acc_log
+
+
+def test(model, test_iter, quality):
+    loss_log, acc_log = [], []
+    model.eval()
+    for batch in tqdm.tqdm(test_iter):
+        data = Variable(batch.text)
+        target = Variable(batch.label) - 1
+        output = model(data)
+        loss = F.nll_loss(output, target)
+        pred = torch.max(output, 1)[1].data.numpy()
+        acc = np.mean(pred == target.data.numpy())
+        acc_log.append(acc)
+        loss_log.append(loss.item())
+    return loss_log, acc_log
+
+
+def main(use_cuda, seed, quality, embedding_size, train_size):
+    if use_cuda and not torch.cuda.is_available():
+        warnings.warn(
+            "CUDA device is not accessible! Setting use_cuda to False.")
+        use_cuda = False
+
+    # seed the RNG so runs are reproducible
+    torch.manual_seed(seed)
+    train_iter, test_iter, vocab_size, batch_size = IMDB_dataset(use_cuda)
+    model = ConvNet(vocab_size, embedding_size)
+    optimizer = torch.optim.RMSprop(model.parameters(), lr=0.001)
+    n_epochs = 3
+
+    train(
+        model,
+        optimizer,
+        n_epochs,
+        train_iter,
+        test_iter,
+        vocab_size,
+        batch_size,
+        quality,
+        train_size)
+
+
+if __name__ == '__main__':
+
+    parser = argparse.ArgumentParser()
+
+    # Parse arguments
+    parser.add_argument('-m', '--model', nargs='?', required=False,
+                        choices=['conv', 'lstm'], default='conv',
+                        help="Model type for sentiment analysis")
+    parser.add_argument('-q', '--target_quality', type=float, required=False,
+                        default=90.6,
+                        help="Target validation quality to stop training")
+    parser.add_argument('-s', '--seed', type=int, required=False, default=1,
+                        help="Seed for random number generator")
+    parser.add_argument('-e', '--embedding_size', type=int, required=False,
+                        default=1024, help="Length of embedding vector")
+
+    args = parser.parse_args()
+
+    if args.model == 'conv':
+        main(use_cuda=True,  # Runs on CPU if "False"
+             seed=args.seed,
+             quality=args.target_quality,
+             embedding_size=args.embedding_size,
+             train_size=25000)
+    else:
+        raise NotImplementedError
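+
+# Example invocations (the flag values below are illustrative):
+#   python train.py                  # conv model, default target quality 90.6
+#   python train.py -q 85.0 -e 512   # lower target quality, smaller embedding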