# lstm_keras_stateless_no_ttl.py

import sys, os
from keras.models import Sequential, load_model
from keras.layers import Dense, Dropout, LSTM, Flatten
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
from random import random
from sklearn.metrics import roc_curve, auc
from graph_tool.all import *

import prep_time_series_input
import scenario_info
import create_graph
from metrics import *
"""
Same as stateless LSTM except it only uses the initial 22 features
(the graph-based features + "function" features excluding the 6 TTL features)
(includes a function that removes the other features from the x vector)
"""

# Disable print statements
def blockPrint():
    """Silence print output by pointing sys.stdout at the OS null device."""
    null_sink = open(os.devnull, 'w')
    sys.stdout = null_sink

# Enable print statements
def enablePrint():
    """Point sys.stdout back at the interpreter's original stdout.

    If the stream being replaced is a handle on os.devnull (which is what
    blockPrint installs), it is closed so the file is released immediately
    instead of whenever it happens to be garbage collected. Any other
    replacement stream is left open, since this function does not own it.
    """
    muted = sys.stdout
    sys.stdout = sys.__stdout__
    # Only close streams that are recognisably the devnull sink; closing an
    # arbitrary stream someone else installed could break their code.
    if muted is not sys.__stdout__ and getattr(muted, 'name', None) == os.devnull:
        muted.close()

'''
Trains the model
Parameters:
x_train - NumPy array for x training set
y_train - NumPy array for y training set
pcap_duration - pcap duration (seconds) - available on CTU website
step_length - step duration (seconds)
save_model - True if model is saved in an h5 file
savefile - name of file that the model is saved to
'''
def create_model(x_train, y_train, pcap_duration, step_length,
                 save_model=True, savefile="model.h5"):
    """Build, train and optionally save the stateless LSTM classifier.

    Architecture: a single 64-unit LSTM that returns its whole output
    sequence, flattened and fed into one sigmoid unit. The model is compiled
    with rmsprop / mean squared error and fitted for 200 epochs without
    shuffling.

    Parameters:
    x_train - NumPy array for x training set, indexed as
              x_train[sample][time_step][feature]
    y_train - NumPy array for y training set
    pcap_duration - pcap duration (seconds) - available on CTU website
    step_length - step duration (seconds)
    save_model - truthy if the model should be saved in an h5 file
    savefile - name of file that the model is saved to

    Returns the trained model. A failed save is reported on stdout but is
    not raised, so the trained model is still returned to the caller.
    """
    print("Starting the creation of the model")

    # len(x_train) = number of samples/vertices
    # len(x_train[0]) = number of time_steps/graphs
    # len(x_train[0][0]) = number of features
    time_steps = len(x_train[0])
    num_features = len(x_train[0][0])

    # A capture shorter than two steps would otherwise give batch_size 0,
    # which is not a usable batch size; clamp to at least one sample.
    batch_size = max(1, int(pcap_duration / (step_length * 2)))

    model = Sequential()
    # A fixed batch size (batch_input_shape) is deliberately NOT used: it
    # would have to match the batch size used later, and it is only needed
    # for a stateful LSTM, not for a stateless one. The disabled variant was:
    #   model.add(LSTM(32, batch_input_shape=(len(x_train), len(x_train[0]),
    #       len(x_train[0][0])), return_sequences=True, stateful=False))

    # One layer (active configuration):
    model.add(LSTM(64, input_shape=(time_steps, num_features),
                   return_sequences=True, stateful=False))
    model.add(Flatten())
    model.add(Dense(1, activation='sigmoid'))

    # Alternative, currently disabled, deeper configurations.
    # Dropout: randomly set half (arbitrary fraction) of the input units to 0
    # at each update during training, which helps prevent overfitting.
    # Perhaps lower the rate if accuracy on the training or validation set is
    # low and increase it if the training set worked well but the test set
    # did not.
    #
    # Two layers:
    #   model.add(LSTM(64, input_shape=(time_steps, num_features),
    #       return_sequences=True, stateful=False))
    #   model.add(Dropout(0.5))
    #   model.add(LSTM(64))
    #   model.add(Dense(1, activation='sigmoid'))
    #
    # Three layers:
    #   model.add(LSTM(64, input_shape=(time_steps, num_features),
    #       return_sequences=True, stateful=False))
    #   model.add(Dropout(0.5))
    #   model.add(LSTM(64, return_sequences=True))
    #   model.add(Dropout(0.5))
    #   model.add(LSTM(64))
    #   model.add(Dropout(0.5))
    #   model.add(Dense(1, activation='sigmoid'))

    # The metric callables are not defined in this file; presumably they come
    # from the `metrics` star import. Their order here fixes the order of the
    # values that model.evaluate() reports after the loss.
    model.compile(optimizer='rmsprop', loss='mean_squared_error',
                  metrics=['accuracy', true_positives, true_negatives,
                           false_positives, false_negatives,
                           true_positive_rate, true_negative_rate,
                           false_positive_rate, false_negative_rate])
    model.fit(x_train, y_train, epochs=200, batch_size=batch_size,
              shuffle=False)

    if save_model:
        # Saving is best effort: report why it failed, but never lose the
        # freshly trained model (and never swallow KeyboardInterrupt).
        try:
            model.save(savefile)
        except Exception as err:
            print("Couldn't save the model: " + str(err))
        else:
            print("Saved model as " + str(savefile))
    return model

'''
Evaluates the model given x_test and y_test
Parameters:
model - model generated by create_model or loaded from h5 file
x_test - NumPy array for x test set
y_test - NumPy array for y test set
pcap_duration - pcap duration (seconds) - available on CTU website
step_length - step duration (seconds)
'''
def evaluate_model(model, x_test, y_test, pcap_duration, step_length):
    """Evaluate the model on a test set and print loss plus confusion metrics.

    Parameters:
    model - model generated by create_model or loaded from h5 file; its
            evaluate() must return ten values in the order loss, accuracy,
            true positives, true negatives, false positives, false negatives,
            true positive rate, true negative rate, false positive rate,
            false negative rate
    x_test - NumPy array for x test set
    y_test - NumPy array for y test set
    pcap_duration - pcap duration (seconds) - available on CTU website
    step_length - step duration (seconds)

    Prints the results to stdout; returns nothing.
    """
    # A capture shorter than two steps would otherwise give batch_size 0,
    # which is not a usable batch size; clamp to at least one sample.
    batch_size = max(1, int(pcap_duration / (step_length * 2)))
    score = model.evaluate(x_test, y_test, batch_size=batch_size)

    # Short local names on purpose: the long ones (true_positives, ...) are
    # also used in this file as metric callables and should not be shadowed.
    loss, accuracy, tp, tn, fp, fn, tp_rate, tn_rate, fp_rate, fn_rate = score

    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print("\n")
    print("Loss: " + str(loss))
    print("Accuracy: " + str(accuracy * 100) + "%")
    print("True positives: " + str(tp))
    print("True positive rate: " + str(tp_rate * 100) + "%")
    print("True negatives: " + str(tn))
    print("True negative rate: " + str(tn_rate * 100) + "%")
    print("False positives: " + str(fp))
    print("False positive rate: " + str(fp_rate * 100) + "%")
    print("False negatives: " + str(fn))
    print("False negative rate: " + str(fn_rate * 100) + "%")

'''
Plots the Receiver Operating Characteristic (ROC) curve, with the area
under the curve, for the given model and x and y data arrays. The figure
is saved to savefile when one is given; it is not shown on screen.
'''
def generate_roc_curve(model, x_test, y_test, data_scenario, model_scenario, savefile=None):
    """Plot the ROC curve of the model on (x_test, y_test) and optionally save it.

    Parameters:
    model - model generated by create_model or loaded from h5 file
    x_test - NumPy array for x test set
    y_test - NumPy array for y test set (true labels)
    data_scenario - scenario the data came from (only used in the title)
    model_scenario - scenario the model was trained on (only used in the title)
    savefile - if not None, path the figure is written to with plt.savefig

    The figure is created but never shown (plt.show() is intentionally left
    out), and it is left open after the call. Returns nothing.
    """
    # Scores that the y result is a 1. Not every Keras release provides
    # predict_proba on the model, so fall back to predict when it is absent.
    score_fn = getattr(model, 'predict_proba', None)
    if score_fn is None:
        score_fn = model.predict
    y_score = score_fn(x_test)

    # Compute the ROC curve and the area under it
    fpr, tpr, _ = roc_curve(y_test, y_score)
    roc_auc = auc(fpr, tpr)

    plt.figure()
    plt.plot(fpr, tpr, color='darkorange',
             lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
    # Dashed diagonal from (0, 0) to (1, 1) drawn as a visual reference line
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic of scenario '
              + str(model_scenario) + '\'s model on scenario '
              + str(data_scenario) + '\'s data')
    plt.legend(loc="lower right")
    if savefile is not None:
        plt.savefig(savefile)
    # plt.show()

def main():
    """Train and evaluate the 22-feature stateless LSTM from saved input arrays.

    Command line arguments:
    sys.argv[1] - file holding the saved x array
    sys.argv[2] - file holding the saved y array
    sys.argv[3] - scenario number the model belongs to (used in file names)
    sys.argv[4] - scenario number of the data (used for scenario lookups
                  and in file names)

    Steps: load the arrays, keep only the first 22 features, balance and
    window the data, split it into training and test sets, train (and save)
    the model, print the evaluation and save the ROC curve as a png.
    """
    if len(sys.argv) < 5:
        # Fail with a usage hint instead of a bare IndexError
        sys.exit("Usage: " + sys.argv[0]
                 + " <x_file> <y_file> <model_scenario> <data_scenario>")

    step_length = 150
    interval_length = 300

    model_scenario = int(sys.argv[3])
    data_scenario = int(sys.argv[4])

    # pcap_file = sys.argv[1]
    # Dictionary of malicious IP addresses with start timestamp as its value.
    # Only consumed by the disabled generate_input_arrays call below; the
    # lookup itself is kept so the scenario is still checked the same way.
    botnet_nodes = scenario_info.get_botnet_nodes(data_scenario)
    pcap_duration = scenario_info.get_pcap_duration(data_scenario) # * 0.1

    savefile_x = sys.argv[1] # 'x_scenario_' + str(data_scenario) + '_lstm.txt'
    savefile_y = sys.argv[2] # 'y_scenario_' + str(data_scenario) + '_lstm.txt'

    # Shared pieces of the output file names
    name_prefix = 'stateless_lstm_22_features_model_scenario_' + str(model_scenario)
    name_suffix = '_interval_' + str(interval_length) + '_step_' + str(step_length)
    model_savefile = name_prefix + name_suffix + '.h5'
    roc_savefile = name_prefix + '_data_scenario_' + str(data_scenario) \
        + name_suffix + '.png'

    # Disabled: regenerate the arrays from the pcap instead of loading them
    #   x, y = prep_time_series_input.generate_input_arrays(pcap_file,
    #       botnet_nodes, pcap_duration, step_length = step_length,
    #       interval_length = interval_length, do_save=True,
    #       savefile_x=savefile_x, savefile_y=savefile_y, verbose = True)
    x, y = prep_time_series_input.load_input_arrays(filename_x=savefile_x,
                                                    filename_y=savefile_y)
    # Drop every feature from index 22 onwards along the feature axis (axis 2)
    x = np.delete(x, np.s_[22:], 2)

    balanced_x, balanced_y = prep_time_series_input.balance_data(x, y, ratio = 10)
    # The unbalanced arrays are not needed any more
    del x
    del y
    # NOTE(review): the meaning of the positional 5 and 2 is defined by
    # prep_time_series_input.time_window_data - confirm there.
    windowed_x, windowed_y, num_samples, windows_per_sample = \
        prep_time_series_input.time_window_data(
            balanced_x, balanced_y, 5, 2, interval_length, step_length,
            data_scenario)

    # Original author's note: the test set contains all the data so obviously
    # it includes the training data...since the training data is so limited,
    # it likely will have little effect on the outcome though.
    # NOTE(review): that note may refer to the disabled split below - confirm
    # whether it still applies to the positive_proportion split in use.
    #   _, _, x_test, y_test = separate_into_sets(x, y, training_proportion = 0)
    #   x_train, y_train, _, _ = \
    #       separate_into_sets(balanced_x, balanced_y, training_proportion = 0.7)
    x_train, y_train, x_test, y_test = prep_time_series_input.separate_into_sets(
        windowed_x, windowed_y, positive_proportion = 0.7)

    # Single-argument print(...) behaves the same on Python 2 and Python 3;
    # the strings are built so the printed text matches the old output.
    print("Number of samples (training and testing):  " + str(num_samples))
    print("Number of windows per sample (training and testing):  "
          + str(windows_per_sample))
    print("x_train, y_train shapes:  " + str(x_train.shape) + " "
          + str(y_train.shape))
    print("x_test, y_test shapes:  " + str(x_test.shape) + " "
          + str(y_test.shape))

    # TEMPORARY: the weights are applied here, by replacing label 1 with 6 in
    # a copy of the training labels, instead of in a custom loss function
    # (which is probably more correct); change this later.
    weighted_y_train = np.copy(y_train)
    weighted_y_train[weighted_y_train == 1] = 6

    # TODO: add k-fold cross validation. Not necessary right now for testing
    # purposes but definitely should be done for the final evaluation.
    # https://machinelearningmastery.com/evaluate-performance-deep-learning-models-keras/
    # http://scikit-learn.org/stable/auto_examples/model_selection/plot_roc_crossval.html

    model = create_model(x_train, weighted_y_train, pcap_duration, step_length,
                         save_model=True, savefile=model_savefile)

    # Disabled: reuse a previously saved model instead of training a new one
    #   model = load_model(model_savefile, custom_objects = \
    #       {'true_positives': true_positives, 'false_positives': false_positives,
    #        'true_negatives': true_negatives, 'false_negatives': false_negatives,
    #        'true_positive_rate': true_positive_rate,
    #        'false_positive_rate': false_positive_rate,
    #        'true_negative_rate': true_negative_rate,
    #        'false_negative_rate': false_negative_rate})

    # Evaluation and the ROC curve use the unweighted test labels
    evaluate_model(model, x_test, y_test, pcap_duration, step_length)
    generate_roc_curve(model, x_test, y_test, data_scenario, model_scenario,
                       savefile = roc_savefile)

# Run the training/evaluation pipeline only when this file is executed as a
# script, so importing it does not start a full training run.
if __name__ == "__main__":
    main()