# pytorch_models.py

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
import numpy as np
import pandas as pd
from collections import defaultdict
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, precision_recall_fscore_support
import pickle
import json
import os
import argparse

# ### Define Constants

# Directory where trained model state dicts (``<model_name>.pth``) are saved
# and looked up.
TRAINED_MODEL_PATH = 'trained_models'
# Output directory for classification results.
CLASSIFICATION_RESULTS_PATH = 'prediction_results/classification'
# Output directory for regression results (created by run_experiment).
REGRESSION_RESULTS_PATH = 'prediction_results/regression'
# CSV file containing the CVE dataset.
DATASET_FILE_PATH = 'dataset/cve_dataset.csv'
# Pickle file containing the pretrained word2vec vectors.
WORD_EMBEDDING_FILE_PATH = 'word_embeddings/word2vec_vectors.pickle'


# ## Models

# ### CNN Model

class CNNModel(nn.Module):
    """
    Convolutional text model over word embeddings.

    Token ids are embedded, run through parallel 1-D convolutions with
    kernel sizes 1, 3 and 5, max-pooled over the sequence axis,
    concatenated, batch-normalised and projected to ``num_classes`` outputs.
    """

    def __init__(self, vocab_size, embedding_size=300, num_filters=128,
                 num_classes=4, weights=None):
        """
        Args:
            vocab_size (int): Number of rows in the embedding table
            embedding_size (int): Dimensionality of each word vector
            num_filters (int): Output channels of every convolution
            num_classes (int): Size of the final linear output
            weights: Optional embedding matrix; when given it is loaded into
                the embedding layer and excluded from gradient updates
        """
        super().__init__()
        self.num_classes = num_classes

        kernel_widths = (1, 3, 5)
        pooled_width = num_filters * len(kernel_widths)

        self.embedding = nn.Embedding(vocab_size, embedding_size)
        if weights is not None:
            # Copy the supplied vectors in and keep them fixed while training.
            self.embedding.load_state_dict({'weight': weights})
            self.embedding.weight.requires_grad = False

        branches = []
        for width in kernel_widths:
            branches.append(nn.Conv1d(embedding_size, num_filters, width))
        self.convs = nn.ModuleList(branches)

        self.bn = nn.BatchNorm1d(pooled_width)
        self.fc = nn.Linear(pooled_width, num_classes)

    def conv_and_max_pool(self, x, conv):
        """Apply ``conv`` + ReLU, then take the maximum over the last axis."""
        activated = F.relu(conv(x))
        pooled, _ = activated.max(dim=2)
        return pooled

    def forward(self, x):
        """Return the raw (un-normalised) outputs for a batch of token ids."""
        # Conv1d expects channels first, so swap the last two axes:
        # (batch, seq_len, embedding) -> (batch, embedding, seq_len)
        embedded = self.embedding(x).transpose(1, 2)
        pooled = [self.conv_and_max_pool(embedded, conv)
                  for conv in self.convs]
        features = torch.cat(pooled, dim=1)
        return self.fc(self.bn(features))


# ### LSTM Model

class LSTMModel(nn.Module):
    """
    LSTM text model with concat pooling.

    Token ids are embedded and fed to a single-layer LSTM; the last
    time-step output, the mean over time and the max over time are
    concatenated and projected to ``num_classes`` outputs.
    """

    def __init__(self, vocab_size, embedding_size=300, hidden_units=128,
                 num_classes=4, weights=None):
        """
        Args:
            vocab_size (int): Number of rows in the embedding table
            embedding_size (int): Dimensionality of each word vector
            hidden_units (int): Hidden size of the LSTM
            num_classes (int): Size of the final linear output
            weights: Optional embedding matrix; when given it is loaded into
                the embedding layer and excluded from gradient updates
        """
        super(LSTMModel, self).__init__()

        self.num_classes = num_classes
        self.hidden_units = hidden_units
        self.num_layers = 1

        # embedding layer
        self.embedding = nn.Embedding(vocab_size, embedding_size)

        if weights is not None:
            self.embedding.load_state_dict({'weight': weights})
            self.embedding.weight.requires_grad = False

        self.lstm = nn.LSTM(embedding_size, hidden_units,
                            num_layers=self.num_layers, batch_first=True)
        # 3x because last state, average pool and max pool are concatenated.
        self.fc = nn.Linear(3 * hidden_units, num_classes)

    def forward(self, x):
        """Return the raw (un-normalised) outputs for a batch of token ids."""
        out = self.embedding(x)

        # Allocate the zero initial state next to the embedded input instead
        # of relying on a module-level `device` global, which only exists
        # when the file is executed as a script.
        state_shape = (self.num_layers, out.size(0), self.hidden_units)
        h_state = (out.new_zeros(state_shape), out.new_zeros(state_shape))

        lstm_out, (hidden_state, cell_state) = self.lstm(out, h_state)
        # lstm_out shape => (batch_size, seq_len, hidden_size)

        # concat pooling
        last_state = lstm_out[:, -1]  # (batch_size, n_hidden)
        avg_pool = lstm_out.mean(1)
        max_pool = lstm_out.max(1)[0]

        out = torch.cat([last_state, avg_pool, max_pool], 1)

        return self.fc(out)


# ### Data Loader class

class TextDataLoader(Dataset):
    """Dataset of (word-id sequence, label) pairs with fixed-length inputs.

    Every sequence is returned as an int32 array of exactly ``max_len``
    entries: shorter ones are right-padded with 0, longer ones are cut.
    """

    def __init__(self, x, y, max_len=1000):
        self.x = x
        self.y = y
        self.max_len = max_len

    def __len__(self):
        return len(self.x)

    def __getitem__(self, i):
        sequence = self.__pad(self.x[i])
        return sequence, self.y[i]

    def __pad(self, x):
        """Truncate or zero-pad ``x`` to ``max_len`` int32 ids."""
        clipped = x[:self.max_len].astype(np.int32)
        shortfall = self.max_len - len(clipped)
        if shortfall <= 0:
            return clipped
        filler = np.zeros(shortfall, dtype=np.int32)
        return np.concatenate((clipped, filler))


class TrainingManager:
    def __init__(self, task, device, use_pretrained, **kwargs):
        """Stores the experiment configuration
        Args:
            task (str): "classification" selects the 'class' label column;
                any other value selects 'cvssV2_baseScore'
            device: device handle kept on the instance
            use_pretrained (bool): whether saved model states may be reused
            **kwargs: optional 'learning_rate' (default 0.001) and
                'batch_size' (default 32); other keys are ignored
        """
        self.task = task
        self.device = device
        self.use_pretrained = use_pretrained

        # Optional hyper-parameters
        self.learning_rate = kwargs.get('learning_rate', 0.001)
        self.batch_size = kwargs.get('batch_size', 32)

        # Column of the dataset that holds the prediction target
        if task == "classification":
            self.label_key = "class"
        else:
            self.label_key = 'cvssV2_baseScore'

        # Training progress is printed once every this many batches
        self.batch_print_every = 200

    @staticmethod
    def read_dataset():
        """Reads cve dataset and returns its DataFrame
        Returns:
            cve_dataset: DataFrame restricted to the columns 'cve_id',
                'description', 'description_cleaned', 'cvssV2_baseScore'
                and 'class'
        Raises:
            Exception: any error raised while reading the CSV or selecting
                the columns is printed and then re-raised
        """
        columns = ['cve_id', 'description', 'description_cleaned',
                   'cvssV2_baseScore', 'class']
        try:
            cve_dataset = pd.read_csv(DATASET_FILE_PATH)
            return cve_dataset[columns]
        except Exception as e:
            # Nothing downstream can run without the dataset. Returning None
            # here only moved the failure to a later, unrelated-looking
            # error, so report the problem and fail at the source.
            print(e)
            raise

    @staticmethod
    def read_word_embeddings():
        """Reads word embeddings and returns the unpickled object
        Returns:
            word2vec_vectors: object stored in the word embedding pickle
                (looked up by word when the weights matrix is built)
        Raises:
            Exception: any error raised while opening or unpickling the file
                is printed and then re-raised
        """
        try:
            with open(WORD_EMBEDDING_FILE_PATH, 'rb') as f:
                # NOTE(review): pickle.load can execute arbitrary code; only
                # use embedding files from a trusted source.
                return pickle.load(f)
        except Exception as e:
            # Returning None here would only surface later as a confusing
            # failure while indexing the embeddings; fail at the source.
            print(e)
            raise

    @staticmethod
    def build_vocab(cve_dataset):
        """Builds vocabulary from entire cve_dataset
        Args:
            cve_dataset: pandas DataFrame of cve entries
        Returns:
            words: set of words
            word_id_matching: dictionary of word - id matchings
        """
        word_counts = defaultdict(int)
        for description in cve_dataset['description_cleaned']:
            _words = description.split()
            for word in _words:
                word_counts[word] += 1

        words = [k for k, v in word_counts.items() if v > 1]
        word_id_matching = dict(zip(words, range(len(words))))
        return words, word_id_matching

    def convert_sentences_to_id_list(self, sentences, word_id_matching):
        """Converts sentences to word-id's list instead of words
        Args:
            sentences (list): List of sentences, each a list of words
            word_id_matching (dict): Dictionary of word-id matching
        Returns:
            arr (np.ndarray): 1-D object array with one entry per sentence;
                each entry is the int32 word-id array of that sentence
        """
        id_arrays = [self.convert_words_to_ids(sentence, word_id_matching)
                     for sentence in sentences]

        # Sentences have different lengths, and np.array() on such a ragged
        # list raises ValueError on recent NumPy releases. Filling an object
        # array element by element works on every version and always yields
        # the same 1-D layout, even when all sentences share one length.
        arr = np.empty(len(id_arrays), dtype=object)
        for position, ids in enumerate(id_arrays):
            arr[position] = ids

        return arr

    @staticmethod
    def convert_words_to_ids(word_list, word_id_matching):
        """Converts words  in a given word list to word-ids
        Args:
            word_list (list): List of words
            word_id_matching (dict): Dictionary of word-id matching
        Returns:
            words: numpy array of word ids
        """
        words = []
        for word in word_list:
            if word in word_id_matching:
                words.append(word_id_matching[word])
        return np.array(words).astype(np.int32)

    def get_vectors_for_dataset(self, dataset, word_id_matching):
        """Returns word ids of the descriptions and the labels of a dataset.
        Args:
            dataset: train, test or validation DataFrame
            word_id_matching (dict): Dictionary of word-id matching
        Returns:
            X: word-id sequences of the whitespace-split
               'description_cleaned' column, one per row
            y: values of the label column selected by `self.label_key`
        """
        tokenized = dataset['description_cleaned'].str.split().values
        labels = dataset[self.label_key].values
        word_ids = self.convert_sentences_to_id_list(tokenized,
                                                     word_id_matching)
        return word_ids, labels

    @staticmethod
    def train_validation_test_split(cve_dataset):
        """Splits cve_dataset into train, validation and test set.
        20% of the rows go to the test set; 12.5% of the remainder goes to
        the validation set. Both splits are stratified on the 'class'
        column and use a fixed random state.
        Args:
            cve_dataset: pandas DataFrame of cve entries

        Returns:
            train_set
            val_set
            test_set
        """
        seed = 1773

        train_val_set, test_set = train_test_split(
            cve_dataset,
            test_size=0.2,
            random_state=seed,
            stratify=cve_dataset['class'])

        train_set, val_set = train_test_split(
            train_val_set,
            test_size=0.125,
            random_state=seed,
            stratify=train_val_set['class'])

        return train_set, val_set, test_set

    @staticmethod
    def get_word2vec_weights_matrix(word_id_matching, word2vec_vectors,
                                    embedding_dim=300):
        """Builds a matrix of word2vec weights with size (vocab_size x embedding_dim)
           Each word in the vocabulary is represented as word2vec vectors
        Args:
            word_id_matching (dict): Dictionary of word-id matching
            word2vec_vectors (dict): Dictionary of word-word2vec_vector matching
            embedding_dim (int): Dimensionality of embedding
        Returns:
            weights_matrix (np.array): array with size (vocab_size x embedding_dim)
                each row represents a word embedding for the word having corresponding
                id in word_id_matching
        """
        matrix_len = len(word_id_matching.keys())
        weights_matrix = np.zeros((matrix_len, embedding_dim))
        non_found_words = []

        for word, ix in word_id_matching.items():
            try:
                weights_matrix[ix] = word2vec_vectors[word]
            except KeyError:
                weights_matrix[ix] = np.random.normal(0, 1,
                                                      size=(embedding_dim,))
                non_found_words.append(word)

        return torch.from_numpy(weights_matrix)

    def get_dataloaders(self, train_set, val_set, test_set, word_id_matching):
        """Builds batch loaders for the train, validation and test sets.
           Descriptions are converted to word-id sequences and every
           sequence is padded to the longest one found across the three
           sets. Only the training loader shuffles.
        Args:
            train_set: pandas DataFrame of training set
            val_set: pandas DataFrame of validation set
            test_set: pandas DataFrame of test set
            word_id_matching (dict): Dictionary of word-id matching
        Returns:
            train_loader: training set loader
            val_loader: validation set loader
            test_loader: test set loader
        """
        train_x, train_y = self.get_vectors_for_dataset(train_set,
                                                        word_id_matching)
        val_x, val_y = self.get_vectors_for_dataset(val_set,
                                                    word_id_matching)
        test_x, test_y = self.get_vectors_for_dataset(test_set,
                                                      word_id_matching)

        every_sequence = np.concatenate((train_x, test_x, val_x), axis=0)
        longest = np.max([len(sequence) for sequence in every_sequence])

        def make_loader(inputs, targets, shuffle):
            padded = TextDataLoader(inputs, targets, max_len=longest)
            return DataLoader(padded, batch_size=self.batch_size,
                              num_workers=0, shuffle=shuffle)

        return (make_loader(train_x, train_y, True),
                make_loader(val_x, val_y, False),
                make_loader(test_x, test_y, False))

    def train(self, model, criteria, optimizer, data_loader):
        """Runs one training epoch of given model
        Args:
            model: Pytorch model
            criteria: Loss criteria
            optimizer: Optimizer object
            data_loader: Input data loader yielding (text, label) batches
        Returns:
            epoch_loss: Mean of the per-batch losses in this epoch
            epoch_acc: Accuracy calculated in this epoch for classification,
                None for regression
        """
        model.train()
        is_classification = self.task == "classification"
        num_batches = len(data_loader)

        total_corrects = 0
        total_loss = 0
        total_seen = 0

        for i, (text, label) in enumerate(data_loader):
            # Use the device stored on the manager; the module-level `device`
            # global only exists when the file is run as a script.
            text = text.to(self.device).long()
            label = label.to(self.device)

            model.zero_grad()

            if is_classification:
                label = label.long()
                output = model(text)
                _, predicted = torch.max(output.detach(), 1)
            else:
                label = label.float()
                # Drop only the trailing output axis so a batch with a single
                # sample keeps its batch dimension and still matches `label`.
                output = model(text).squeeze(-1)

            loss = criteria(output, label)

            loss.backward()
            optimizer.step()

            total_loss += loss.item()

            if is_classification:
                total_seen += label.size(0)
                total_corrects += (predicted == label).sum().item()

            if i % self.batch_print_every == 0:
                avg_loss = total_loss / (i + 1)

                if is_classification:
                    avg_acc = total_corrects / total_seen
                    print("Batch {}/{} avg loss: {:.3f}, avg acc: {:.3f}"
                          .format(i, num_batches, avg_loss, avg_acc))
                else:
                    print("Batch {}/{} avg loss: {:.3f}"
                          .format(i, num_batches, avg_loss))

        epoch_loss = total_loss / num_batches
        epoch_acc = total_corrects / total_seen if is_classification else None

        return epoch_loss, epoch_acc

    def test(self, model, criteria, data_loader):
        """Runs training phase of given model
        Args:
            model: Pytorch model
            criteria: Loss criteria
            data_loader: Input data loader
        Returns:
            y_pred: Predicted values
            epoch_loss: Loss value calculated in this epoch
            epoch_acc: Accuracy calculated in this epoch
        """
        model.eval()
        total_corrects = 0
        total_loss = 0
        total_seen = 0

        y_pred = []

        with torch.no_grad():
            for i, data in enumerate(data_loader):
                text, label = data
                text = text.to(device).long()
                label = label.to(device)

                if self.task == "classification":
                    label = label.long()
                    output = model(text)
                    _, predicted = torch.max(output.data, 1)
                else:
                    label = label.float()
                    output = model(text).squeeze()
                    predicted = output

                loss = criteria(output, label)
                total_loss += loss.item()

                if self.task == "classification":
                    total_seen += label.size(0)
                    total_corrects += (predicted == label).sum().item()

                y_pred.extend(predicted.to('cpu').numpy())

        epoch_loss = total_loss / len(data_loader)
        epoch_acc = total_corrects / total_seen \
            if self.task == "classification" else None

        return y_pred, epoch_loss, epoch_acc

    @staticmethod
    def classification_report(preds, labels=(0, 1, 2, 3)):
        """Evaluates predictions and generates a report for classification
        Args:
            preds: DataFrame of predictions having 'actual' and 'pred' columns
            labels: class labels to report on, in row/column order of the
                per-class table and the confusion matrix
        Returns:
            preds: the predictions, unchanged
            scores: dict with accuracy and support-weighted precision,
                recall and f1-score, rounded to 3 decimals
            results_by_class: DataFrame of precision, recall, f1-score and
                support per class
            conf_matrix: DataFrame of the confusion matrix
        """
        labels = list(labels)
        actual = preds["actual"]
        predicted = preds["pred"]

        acc = int((actual == predicted).sum()) / preds.shape[0]

        # Passing `labels` keeps the output one row per class even when a
        # class is missing from this split; otherwise the fixed index below
        # would not match the shape returned by scikit-learn.
        clf_results = np.array(
            precision_recall_fscore_support(actual, predicted,
                                            labels=labels))

        results_by_class = pd.DataFrame(clf_results.T,
                                        columns=['precision', 'recall',
                                                 'f1-score', 'support'],
                                        index=labels)

        # Rows of clf_results: precision, recall, f1-score, support.
        support = clf_results[3]
        total_support = np.sum(support)
        avg_precision = np.sum(clf_results[0] * support) / total_support
        avg_recall = np.sum(clf_results[1] * support) / total_support
        f1_score = np.sum(clf_results[2] * support) / total_support

        scores = {
            'acc': round(acc, 3),
            'avg_precision': round(avg_precision, 3),
            'avg_recall': round(avg_recall, 3),
            'f1_score': round(f1_score, 3)
        }

        conf_matrix = pd.DataFrame(
            confusion_matrix(actual, predicted, labels=labels),
            columns=labels,
            index=labels)

        print('Confusion Matrix: \n')
        print(conf_matrix)
        print('\nPrecision, Recall, F1-Scores for Classes\n')
        print(results_by_class)
        print('\nScores\n')
        print(scores)

        return preds, scores, results_by_class, conf_matrix

    @staticmethod
    def regression_report(preds):
        """Evaluates predictions and generates a report for classification
        Args:
            preds: DataFrame of predictions having actual and predicted values
        Returns:
            predictions: predictions itself
            scores: mae, mmre, mdmre, rmse, mape scores
        """

        preds['abs_err'] = np.abs(
            preds['actual'] - preds['pred'])
        preds['rel_err'] = preds['abs_err'] / preds['actual']
        preds['squared_error'] = (preds['actual'] - preds[
            'pred']) ** 2

        mae = preds['abs_err'].mean()
        mmre = preds['rel_err'].mean()
        mdmre = preds['rel_err'].median()
        rmse = np.sqrt(preds['squared_error'].mean())
        mape = mmre * 100

        scores = {
            'mae': round(mae, 3),
            'mmre': round(mmre, 3),
            'mdmre': round(mdmre, 3),
            'rmse': round(rmse, 3),
            'mape': round(mape, 3)
        }

        print('\nScores\n')
        print(scores)

        return preds, scores, None, None

    def save_predictions(self, preds, results_by_class, conf_matrix, scores,
                         file_name):
        """Saves predictions and scores of one model
        Files are written under the results folder of the current task
        (classification or regression), each prefixed with `file_name`.
        Args:
            preds: DataFrame of predictions
            results_by_class: per-class results (classification only)
            conf_matrix: confusion matrix (classification only)
            scores: dict of scores, written as JSON
            file_name (str): prefix of the output files
        """
        is_classification = self.task == "classification"

        # Regression output used to land in the classification folder even
        # though a dedicated regression folder exists.
        results_path = CLASSIFICATION_RESULTS_PATH if is_classification \
            else REGRESSION_RESULTS_PATH
        out_file_prefix = results_path + "/" + file_name

        preds.to_csv(out_file_prefix + '_predictions.csv', index=True)

        if is_classification:
            conf_matrix.to_csv(out_file_prefix + '_conf_matrix.csv',
                               index=True)
            results_by_class.to_csv(out_file_prefix + "_class_results.csv",
                                    index=True)

        with open(out_file_prefix + "_scores.json", 'w') as f:
            json.dump(scores, f)

    @staticmethod
    def get_pretrained_model(model_name, file_extension=".pth"):
        """Looks up a saved model state in the trained-model folder
        Args:
            model_name (str): Name of model
            file_extension (str): Extension of the saved state file
        Returns:
            model_state: state loaded onto the CPU from the file named
                `model_name + file_extension`, or None when the folder has
                no entry with exactly that name
        """
        wanted = model_name + file_extension

        if wanted not in os.listdir(TRAINED_MODEL_PATH):
            return None

        return torch.load(TRAINED_MODEL_PATH + "/" + wanted,
                          map_location='cpu')

    def get_model_definitions(self, vocab_size, weights_matrix):
        """Creates CNN and LSTM models and their optimizers
        Both models are moved to `self.device` and get their own Adam
        optimizer. Regression models have a single output and a "_Reg"
        name suffix; classification models have 4 outputs.
        Args:
            vocab_size (int): Number of words in the vocabulary
            weights_matrix: word2vec weights matrix
        Returns:
            model_definitions: list of dicts with the keys 'model_name',
                'model', 'optimizer' and 'n_epochs', one per model
        """
        is_regression = self.task == "regression"
        num_classes = 1 if is_regression else 4
        name_suffix = "_Reg" if is_regression else ""

        # Models are placed on the device stored on the manager; the
        # module-level `device` global only exists when run as a script.
        cnn_model = CNNModel(vocab_size=vocab_size,
                             weights=weights_matrix,
                             num_classes=num_classes).to(self.device)

        cnn_optimizer = optim.Adam(cnn_model.parameters(),
                                   lr=self.learning_rate,
                                   weight_decay=0.0001)

        lstm_model = LSTMModel(vocab_size=vocab_size,
                               weights=weights_matrix,
                               num_classes=num_classes).to(self.device)

        lstm_optimizer = optim.Adam(lstm_model.parameters(),
                                    lr=self.learning_rate,
                                    weight_decay=0.0001)

        model_definitions = [
            {
                'model_name': "CNN_With_Word2vec_Weights" + name_suffix,
                'model': cnn_model,
                'optimizer': cnn_optimizer,
                'n_epochs': 3
            },
            {
                'model_name': "LSTM_With_Word2vec_Weights" + name_suffix,
                'model': lstm_model,
                'optimizer': lstm_optimizer,
                'n_epochs': 3
            }
        ]

        return model_definitions

    def run_training_task(self, model_definition, criteria,
                          train_loader, val_loader):
        """Trains one model for its configured number of epochs
        After every epoch the model is evaluated on the validation set and
        the train/validation loss (plus accuracy for classification) is
        printed.
        Args:
            model_definition: Model definition which is going to be trained
            criteria: Loss criteria
            train_loader: train data loader
            val_loader: validation data loader
        Returns:
            model_definition: the same dict, with 'model' set to the
                trained model
        """
        network = model_definition['model']
        optimizer = model_definition['optimizer']
        n_epochs = model_definition['n_epochs']
        with_accuracy = self.task == "classification"

        print('Training {}'.format(model_definition['model_name']))

        for epoch_no in range(1, n_epochs + 1):
            train_loss, train_acc = self.train(network, criteria, optimizer,
                                               train_loader)
            train_line = "[{}/{}]: Train loss: {:.3f}".format(
                epoch_no, n_epochs, train_loss)
            if with_accuracy:
                train_line += ", acc: {:.3f}".format(train_acc)

            print()
            print(train_line)

            _, val_loss, val_acc = self.test(network, criteria, val_loader)
            val_line = "[{}/{}]: Val loss: {:.3f}".format(
                epoch_no, n_epochs, val_loss)
            if with_accuracy:
                val_line += ", acc: {:.3f}".format(val_acc)

            print(val_line)

        model_definition['model'] = network

        return model_definition

    def run_prediction_task(self, model_definition, criteria,
                            test_loader, test_set):
        """Evaluates one model on the test set and reports the results
        Args:
            model_definition: Model definition which is going to be tested
            criteria: Loss criteria
            test_loader: test data loader
            test_set: pandas DataFrame of test set
        Returns:
            the 4-tuple produced by classification_report or
            regression_report, depending on the task
        """
        model_name = model_definition['model_name']
        is_classification = self.task == "classification"

        print('\nTesting {}'.format(model_name))

        y_pred, test_loss, test_acc = self.test(model_definition['model'],
                                                criteria, test_loader)

        if is_classification:
            print("Test loss: {:.3f}, acc: {:.3f}".format(test_loss, test_acc))
        else:
            print("Test loss: {:.3f}".format(test_loss))

        # One row per test sample: actual label next to the prediction,
        # indexed by cve id.
        actual_and_predicted = np.stack(
            (test_set[self.label_key], y_pred)).T
        predictions = pd.DataFrame(actual_and_predicted,
                                   columns=["actual", "pred"],
                                   index=test_set["cve_id"])

        print("\n Test Results for {} \n".format(model_name))

        if is_classification:
            report = self.classification_report
        else:
            report = self.regression_report

        return report(predictions)

    def run_experiment(self):
        """Runs the whole experiment for the configured task
        Reads the dataset and word embeddings, builds the vocabulary and the
        data loaders, then trains each model (or loads its saved state),
        evaluates it on the test set and saves the results.
        """
        # Step 0: Pre-Training Step
        # create output folders if they dont exist
        output_folders = [TRAINED_MODEL_PATH, CLASSIFICATION_RESULTS_PATH,
                          REGRESSION_RESULTS_PATH]
        for out_folder in output_folders:
            os.makedirs(out_folder, exist_ok=True)

        # Step 1: Read Dataset
        cve_dataset = self.read_dataset()

        # Step 2: Read Pre-trained word embeddings
        word_embeddings = self.read_word_embeddings()

        # Step 3: Build Vocabulary
        vocab, word_id_matching = self.build_vocab(cve_dataset)

        # Step 4: Split cve_dataset to train-validation-test
        train_set, val_set, test_set = self.train_validation_test_split(
            cve_dataset)

        # Step 5: Create DataLoaders
        train_loader, val_loader, test_loader = self.get_dataloaders(
            train_set,
            val_set,
            test_set,
            word_id_matching
        )

        # Step 6: Build word2vec weights matrix
        weights_matrix = self.get_word2vec_weights_matrix(word_id_matching,
                                                          word_embeddings)

        # Step 7: Define Loss Criteria
        criteria = nn.CrossEntropyLoss() if self.task == "classification" \
            else nn.MSELoss()

        # Use the device stored on the manager; the module-level `device`
        # global only exists when the file is run as a script.
        criteria = criteria.to(self.device)

        # Step 8: Training task

        # Step 8.1: Get Model Definitions
        model_definitions = self.get_model_definitions(len(vocab),
                                                       weights_matrix)

        for model_def in model_definitions:
            model_name = model_def['model_name']

            # Step 8.2: Try to get pretrained model
            # Only touch the saved state when it may actually be used, so a
            # forced re-training never loads an old checkpoint.
            model_state = None
            if self.use_pretrained is not False:
                model_state = self.get_pretrained_model(model_name)

            # Step 8.3: Run Training Task
            if model_state is None:
                # Do the training
                model_def = self.run_training_task(
                    model_def, criteria, train_loader, val_loader
                )

                # Save the model
                torch.save(model_def['model'].state_dict(),
                           TRAINED_MODEL_PATH + "/" + model_name + ".pth")
            else:
                # Skip to prediction
                model = model_def['model']
                model.load_state_dict(model_state)
                model.eval()
                model_def['model'] = model
                print('Found a pretrained model for {}'.format(model_name))
                print('Skipping to prediction step.')

            # Step 8.4: Prediction

            print('\nPrediction results for test set\n')
            predictions, scores, results_by_class, conf_matrix = \
                self.run_prediction_task(
                    model_def,
                    criteria,
                    test_loader,
                    test_set
                )

            print()

            # Step 8.5: Save Final predictions
            self.save_predictions(
                predictions, results_by_class, conf_matrix, scores, model_name
            )


def str2bool(v):
    """Converts a candidate string to bool
    Matching ignores case and surrounding whitespace. A value that already
    is a bool is returned unchanged.
    Args:
        v (str or bool)
    Returns:
        True for 'yes', 'true', 't', 'y', '1';
        False for 'no', 'false', 'f', 'n', '0'
    Raises:
        argparse.ArgumentTypeError: if the value is none of the above

    Source: https://stackoverflow.com/a/43357954
    """
    if isinstance(v, bool):
        return v

    normalized = v.strip().lower()
    if normalized in ('yes', 'true', 't', 'y', '1'):
        return True
    if normalized in ('no', 'false', 'f', 'n', '0'):
        return False
    raise argparse.ArgumentTypeError('Boolean value expected.')


if __name__ == "__main__":
    # Define Options
    # Read options from command line
    parser = argparse.ArgumentParser()
    parser.add_argument('--use_pretrained', default=True, type=str2bool,
                        help="""True for using pretrained models, 
                                False otherwise. Default: True""")
    # The task is compared against these exact strings further down, so any
    # other value is rejected up front instead of silently selecting a
    # mismatched configuration.
    parser.add_argument('--task', default="classification",
                        choices=["classification", "regression"],
                        help="""classification or regression. 
                                Default: classification""")
    parser.add_argument('--use_gpu', default=True, type=str2bool,
                        help="""True for using GPU. False otherwise.
                                Default: True""")

    args = parser.parse_args()

    use_pretrained = args.use_pretrained
    task = args.task
    use_gpu = args.use_gpu

    # Fall back to the CPU when CUDA is unavailable or the GPU was not
    # requested. `device` is assigned at module level on purpose: code in
    # this file may read it as a global.
    device = torch.device(
        'cuda' if torch.cuda.is_available() and use_gpu else 'cpu')

    training_manager = TrainingManager(task, device, use_pretrained)
    training_manager.run_experiment()