-
Notifications
You must be signed in to change notification settings - Fork 3
/
lstm_keras_stateless_no_ttl.py
233 lines (211 loc) · 9.63 KB
/
lstm_keras_stateless_no_ttl.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
import sys, os
from keras.models import Sequential, load_model
from keras.layers import Dense, Dropout, LSTM, Flatten
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
from random import random
from sklearn.metrics import roc_curve, auc
from graph_tool.all import *
import prep_time_series_input
import scenario_info
import create_graph
from metrics import *
"""
Same as the stateless LSTM, except that it only uses the first 22 features
(the graph-based features plus the "function" features, excluding the 6 TTL
features). The remaining features are dropped from the x array in main()
with np.delete.
"""
def blockPrint():
    """Disable print statements by redirecting sys.stdout to os.devnull.

    If stdout is already an open os.devnull stream, nothing is done, so
    repeated calls do not open (and leak) additional file handles.
    """
    already_blocked = (
        getattr(sys.stdout, 'name', None) == os.devnull
        and not getattr(sys.stdout, 'closed', False)
    )
    if already_blocked:
        return
    sys.stdout = open(os.devnull, 'w')
def enablePrint():
    """Enable print statements again by restoring the original sys.stdout.

    If the stream being replaced is an os.devnull sink (as installed by
    blockPrint), it is closed so that its file handle is not leaked.
    """
    replaced = sys.stdout
    sys.stdout = sys.__stdout__
    # Only a devnull sink is closed; any other stream may belong to
    # someone else and is left untouched.
    if replaced is not sys.__stdout__ and \
            getattr(replaced, 'name', None) == os.devnull:
        replaced.close()
def create_model(x_train, y_train, pcap_duration, step_length,
                 save_model=True, savefile="model.h5"):
    """Build, compile and train the stateless LSTM model.

    Parameters:
    x_train - NumPy array for x training set, indexed as
        [sample/vertex][time step/graph][feature]
    y_train - NumPy array for y training set
    pcap_duration - pcap duration (seconds) - available on CTU website
    step_length - step duration (seconds)
    save_model - if true, the trained model is saved in an h5 file
    savefile - name of file that the model is saved to

    Returns the trained model. A failure to save the model is reported on
    stdout; the trained model is still returned in that case.
    """
    print("Starting the creation of the model")

    # len(x_train) = number of samples/vertices
    # len(x_train[0]) = number of time_steps/graphs
    # len(x_train[0][0]) = number of features
    time_steps = len(x_train[0])
    num_features = len(x_train[0][0])

    model = Sequential()
    # The batch size is deliberately left out of the input shape. Fixing it
    # (batch_input_shape) means every later call has to match that batch
    # size; that is necessary for a stateful LSTM but not for a stateless
    # one.
    #
    # One layer:
    model.add(LSTM(64, input_shape=(time_steps, num_features),
                   return_sequences=True, stateful=False))
    model.add(Flatten())
    model.add(Dense(1, activation='sigmoid'))

    # Alternative deeper stacks (disabled), kept for reference.
    # Dropout(0.5) randomly sets half (an arbitrary fraction) of the input
    # units to 0 at each update during training, which helps prevent
    # overfitting. Perhaps lower the rate if accuracy on the training or
    # validation set is low, and increase it if the training set worked
    # well but the test set did not.
    #
    # Two layers:
    #   model.add(LSTM(64, input_shape=(time_steps, num_features),
    #                  return_sequences=True, stateful=False))
    #   model.add(Dropout(0.5))
    #   model.add(LSTM(64))
    #   model.add(Dense(1, activation='sigmoid'))
    #
    # Three layers:
    #   model.add(LSTM(64, input_shape=(time_steps, num_features),
    #                  return_sequences=True, stateful=False))
    #   model.add(Dropout(0.5))
    #   model.add(LSTM(64, return_sequences=True))
    #   model.add(Dropout(0.5))
    #   model.add(LSTM(64))
    #   model.add(Dropout(0.5))
    #   model.add(Dense(1, activation='sigmoid'))

    model.compile(optimizer='rmsprop', loss='mean_squared_error',
                  metrics=['accuracy', true_positives, true_negatives,
                           false_positives, false_negatives,
                           true_positive_rate, true_negative_rate,
                           false_positive_rate, false_negative_rate])

    # A capture shorter than two steps would otherwise give a batch size
    # of 0, so clamp it to at least one sample per batch.
    batch_size = max(1, int(pcap_duration / (step_length * 2)))
    model.fit(x_train, y_train, epochs=200, batch_size=batch_size,
              shuffle=False)

    if save_model:
        try:
            model.save(savefile)
        except Exception as error:
            # Saving is best-effort, but report why it failed instead of
            # silently swallowing every exception.
            print("Couldn't save the model: " + str(error))
        else:
            print("Saved model as " + str(savefile))
    return model
def evaluate_model(model, x_test, y_test, pcap_duration, step_length):
    """Evaluate the model on x_test and y_test and print every metric.

    Parameters:
    model - model generated by create_model or loaded from h5 file
    x_test - NumPy array for x test set
    y_test - NumPy array for y test set
    pcap_duration - pcap duration (seconds) - available on CTU website
    step_length - step duration (seconds)

    The result of model.evaluate must contain exactly ten values: the loss
    followed by accuracy, true/false positive/negative counts and their
    rates, in the order unpacked below.
    """
    # A capture shorter than two steps would otherwise give a batch size
    # of 0, so clamp it to at least one sample per batch.
    batch_size = max(1, int(pcap_duration / (step_length * 2)))
    score = model.evaluate(x_test, y_test, batch_size=batch_size)

    # Short local names avoid shadowing the metric functions of the same
    # names that the file imports from the metrics module.
    loss, accuracy, tp, tn, fp, fn, tpr, tnr, fpr, fnr = score

    print("\n")
    print("Loss: " + str(loss))
    print("Accuracy: " + str(accuracy * 100) + "%")
    print("True positives: " + str(tp))
    print("True positive rate: " + str(tpr * 100) + "%")
    print("True negatives: " + str(tn))
    print("True negative rate: " + str(tnr * 100) + "%")
    print("False positives: " + str(fp))
    print("False positive rate: " + str(fpr * 100) + "%")
    print("False negatives: " + str(fn))
    print("False negative rate: " + str(fnr * 100) + "%")
def generate_roc_curve(model, x_test, y_test, data_scenario, model_scenario, savefile=None):
    """Plot the Receiver Operating Characteristic (ROC) curve of the model.

    Parameters:
    model - trained model used to score x_test
    x_test - x data array
    y_test - y data array (true labels)
    data_scenario - scenario number of the data, used in the plot title
    model_scenario - scenario number of the model, used in the plot title
    savefile - if not None, file name the figure is saved to

    The area under the curve is shown in the legend. The figure is not
    displayed; it is only written to disk when savefile is given.
    """
    # Get array of probabilities that the y result is a 1. predict_proba
    # does not exist on newer Keras releases, so fall back to predict
    # (NOTE(review): for the sigmoid output used in this file predict is
    # expected to return the same probabilities - confirm on upgrade).
    predict = getattr(model, 'predict_proba', None) or model.predict
    y_score = predict(x_test)

    # Compute ROC curve and ROC area
    fpr, tpr, _ = roc_curve(y_test, y_score)
    roc_auc = auc(fpr, tpr)

    plt.figure()
    plt.plot(fpr, tpr, color='darkorange',
             lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
    # Diagonal reference line from (0, 0) to (1, 1)
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic of scenario '
              + str(model_scenario) + '\'s model on scenario '
              + str(data_scenario) + '\'s data')
    plt.legend(loc="lower right")
    if savefile is not None:
        plt.savefig(savefile)
    # plt.show()
def main():
    """Train the 22-feature stateless LSTM and evaluate it on one scenario.

    Command-line arguments:
    sys.argv[1] - file the saved x array is loaded from
    sys.argv[2] - file the saved y array is loaded from
    sys.argv[3] - model scenario number (used in the output file names)
    sys.argv[4] - data scenario number

    The trained model is saved to an h5 file and the ROC curve to a png
    file; both names are built from the scenario numbers, the interval
    length and the step length. Exits with a usage message when fewer than
    four arguments are given.
    """
    if len(sys.argv) < 5:
        program = sys.argv[0] if sys.argv else 'lstm_keras_stateless_no_ttl.py'
        sys.exit("Usage: " + program
                 + " <x_file> <y_file> <model_scenario> <data_scenario>")

    step_length = 150
    interval_length = 300

    savefile_x = sys.argv[1]  # 'x_scenario_' + str(data_scenario) + '_lstm.txt'
    savefile_y = sys.argv[2]  # 'y_scenario_' + str(data_scenario) + '_lstm.txt'
    model_scenario = int(sys.argv[3])
    data_scenario = int(sys.argv[4])

    # Dictionary of malicious IP addresses with start timestamp as its value.
    # Only needed by the (disabled) array generation step below.
    botnet_nodes = scenario_info.get_botnet_nodes(data_scenario)
    pcap_duration = scenario_info.get_pcap_duration(data_scenario)  # * 0.1

    model_savefile = 'stateless_lstm_22_features_model_scenario_' + str(model_scenario) \
        + '_interval_' + str(interval_length) + '_step_' + str(step_length) + '.h5'

    # To regenerate the arrays from a pcap file instead of loading them:
    #   pcap_file = sys.argv[1]
    #   x, y = prep_time_series_input.generate_input_arrays(pcap_file,
    #       botnet_nodes, pcap_duration, step_length = step_length,
    #       interval_length = interval_length, do_save=True,
    #       savefile_x=savefile_x, savefile_y=savefile_y, verbose = True)
    x, y = prep_time_series_input.load_input_arrays(filename_x=savefile_x,
                                                    filename_y=savefile_y)

    # Keep only the first 22 features (index 0-21 along axis 2).
    x = np.delete(x, np.s_[22:], 2)
    balanced_x, balanced_y = prep_time_series_input.balance_data(x, y, ratio=10)
    # The unbalanced arrays are no longer needed; drop the references.
    del x
    del y

    # NOTE(review): 5 and 2 are presumably the window length and stride -
    # confirm against prep_time_series_input.time_window_data.
    windowed_x, windowed_y, num_samples, windows_per_sample = \
        prep_time_series_input.time_window_data(
            balanced_x, balanced_y, 5, 2, interval_length, step_length,
            data_scenario)

    # Earlier approach (disabled): test on all of the data, train on 70% of
    # the balanced data, so the test set also contained the training data.
    #   _, _, x_test, y_test = separate_into_sets(x, y, training_proportion = 0)
    #   x_train, y_train, _, _ = \
    #       separate_into_sets(balanced_x, balanced_y, training_proportion = 0.7)
    x_train, y_train, x_test, y_test = prep_time_series_input.separate_into_sets(
        windowed_x, windowed_y, positive_proportion=0.7)

    # " ".join reproduces the output of the original comma-separated print
    # statements on both Python 2 and Python 3.
    print(" ".join(["Number of samples (training and testing): ",
                    str(num_samples)]))
    print(" ".join(["Number of windows per sample (training and testing): ",
                    str(windows_per_sample)]))
    print(" ".join(["x_train, y_train shapes: ",
                    str(x_train.shape), str(y_train.shape)]))
    print(" ".join(["x_test, y_test shapes: ",
                    str(x_test.shape), str(y_test.shape)]))

    # TEMPORARY: the class weight is applied here, by replacing positive
    # labels with 6, instead of in a custom loss function (which is
    # probably more correct); change this later.
    weighted_y_train = np.copy(y_train)
    weighted_y_train[weighted_y_train == 1] = 6

    # TODO: add k-fold cross validation. Not necessary right now for
    # testing purposes, but it should be done for the final evaluation of
    # the model.
    # https://machinelearningmastery.com/evaluate-performance-deep-learning-models-keras/
    # http://scikit-learn.org/stable/auto_examples/model_selection/plot_roc_crossval.html
    model = create_model(x_train, weighted_y_train, pcap_duration, step_length,
                         save_model=True, savefile=model_savefile)

    # To reuse a previously saved model instead of training:
    #   model = load_model(model_savefile, custom_objects =
    #       {'true_positives': true_positives, 'false_positives': false_positives,
    #        'true_negatives': true_negatives, 'false_negatives': false_negatives,
    #        'true_positive_rate': true_positive_rate,
    #        'false_positive_rate': false_positive_rate,
    #        'true_negative_rate': true_negative_rate,
    #        'false_negative_rate': false_negative_rate})

    # Evaluation uses the unweighted labels.
    evaluate_model(model, x_test, y_test, pcap_duration, step_length)
    generate_roc_curve(model, x_test, y_test, data_scenario, model_scenario,
        savefile='stateless_lstm_22_features_model_scenario_' + str(model_scenario)
        + '_data_scenario_' + str(data_scenario) + '_interval_'
        + str(interval_length) + '_step_' + str(step_length) + '.png')


# Run only when executed as a script, so importing this module does not
# start a training run.
if __name__ == "__main__":
    main()