generator.py
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import collections
import random

import numpy as np
import pandas as pd

# All preprocessing steps defined in this module assume that the flow files
# are encoded in the .binetflow file format produced by the Argus netflow
# utility.
# https://www.qosient.com/argus/argusnetflow.shtml
# The dataset used in this research effort is the CTU-13 dataset. The
# provided link includes an explanation of the dataset along with the
# features that are available within it.
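# For reference (an assumption based on the CTU-13 .binetflow captures, not
# something defined in this module), the column order relied on by the
# positional indexing below (e.g. flow[1][3] for SrcAddr) is typically:
#   StartTime, Dur, Proto, SrcAddr, Sport, Dir, DstAddr, Dport, State,
#   sTos, dTos, TotPkts, TotBytes, SrcBytes, Label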


def strip(text):
    """Strip white space from text

    Arguments:
        text {string} -- string of text to strip white space from

    Returns:
        string
    """
    return text.strip()


# Work left to be done on the potential preprocessing
# steps that can be used for feature engineering
def sort_ip_flow(df: pd.DataFrame, ip: str) -> dict:
    """Match an IP address against the SrcAddr field of each flow

    Arguments:
        df {pd.DataFrame} -- DataFrame of flows to search
        ip {string} -- string representation of an IP address (e.g. 192.168.1.1)

    Returns:
        dict -- mapping of the IP address to the list of matching flows
    """
    flow_list = []
    # iterrows() yields (index, row) tuples, so flow[1][3] is the SrcAddr
    # column; iterating over the DataFrame directly would only yield column
    # labels
    for flow in df.iterrows():
        if ip == flow[1][3]:
            flow_list.append(flow)
    return {ip: flow_list}
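
# Example usage (hypothetical; `flow_df` and the address below are used
# purely for illustration):
#
#   flows_by_ip = sort_ip_flow(flow_df, '147.32.84.165')
#   len(flows_by_ip['147.32.84.165'])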


# Legacy hashing function; it might not be useful anymore, but it could be
# useful later on, so it is kept in the code for now.
# NOTE: `hasher` is not defined in this module and is assumed to be provided
# elsewhere in the project.
def process_flow(flow):
    """Create tokens of flow data by hashing its identifying fields

    Arguments:
        flow {tuple} -- (index, row) tuple, e.g. as yielded by
        DataFrame.iterrows()

    Returns:
        tuple -- the flow with a tuple of hashed field values inserted
    """
    # create hashes of values
    proto_hash = hasher(flow[1][2])
    srcip_hash = hasher(flow[1][3])
    srcprt_hash = hasher(flow[1][4])
    dstip_hash = hasher(flow[1][6])
    dstprt_hash = hasher(flow[1][7])
    flow_list = list(flow)
    # Insert the hashes as an entry in the tuple for each flow
    flow_list.insert(4, (str(proto_hash), str(srcip_hash), str(srcprt_hash),
                         str(dstip_hash), str(dstprt_hash)))
    # Re-cast the flow entry as a tuple w/ the added hash tuple
    flow = tuple(flow_list)
    return flow


def dataframe(filenames: list):
    """Load one or more .binetflow CSV files into a single DataFrame

    Arguments:
        filenames {list} -- list of paths to .binetflow files to load

    Returns:
        pd.DataFrame -- concatenated flow data with whitespace-stripped
        column names
    """
    # pd.concat is used instead of DataFrame.append, which is deprecated in
    # recent versions of pandas
    frames = [pd.read_csv(file, sep=',', header=0) for file in filenames]
    flowdata = pd.concat(frames, ignore_index=True)
    flowdata.rename(columns=lambda x: x.strip(), inplace=True)
    return flowdata
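
# Example usage (hypothetical file names):
#
#   flow_df = dataframe(['capture-scenario-1.binetflow',
#                        'capture-scenario-2.binetflow'])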


def split_cols(dataframe: pd.DataFrame):
    """Select the categorical feature columns and the label column from a
    dataframe of netflow data

    Arguments:
        dataframe {pd.DataFrame} -- flow data loaded from .binetflow files

    Returns:
        tuple -- (categorical features, labels, categorical features plus
        labels), each as a pd.DataFrame
    """
    categories = dataframe.loc[:, ['Proto', 'SrcAddr', 'DstAddr', 'Dport']]
    labels = dataframe.loc[:, ['Label']]
    categories_and_labels = dataframe.loc[:, ['Proto', 'SrcAddr', 'DstAddr',
                                              'Dport', 'Label']]
    return categories, labels, categories_and_labels
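
# Example usage (continuing the hypothetical flow_df above):
#
#   categories, labels, categories_and_labels = split_cols(flow_df)
#   categories.columns.tolist()   # ['Proto', 'SrcAddr', 'DstAddr', 'Dport']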


def create_corpora(dataframe: pd.DataFrame, window: int, corpus_count: int):
    """Create corpora of network flows for use in training a model

    Arguments:
        dataframe {pd.DataFrame} -- DataFrame to split into corpora
        window {int} -- window size (number of flows per corpus)
        corpus_count {int} -- how many corpora to create

    Returns:
        list -- list of corpora, each a pd.DataFrame slice of `window` flows
    """
    corpora = []
    beginning = 0
    end = window
    for i in range(corpus_count):
        corpus = dataframe.iloc[beginning:end]
        corpora.append(corpus)
        # iloc slices exclude the end index, so the next corpus starts at
        # `end`; starting at end + 1 would silently drop one flow per window
        beginning = end
        end += window
    return corpora
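
# Example usage (continuing the hypothetical flow_df above): split the first
# 3,000 flows into three corpora of 1,000 flows each:
#
#   corpora = create_corpora(flow_df, window=1000, corpus_count=3)
#   [len(c) for c in corpora]   # [1000, 1000, 1000] if flow_df is large enough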


def generate_batch(batch_size: int, num_skips: int, skip_window: int):
    """Generate a batch for training

    Arguments:
        batch_size {int} -- batch size for the training dataset
        num_skips {int} -- number of (target, context) pairs drawn per target
        skip_window {int} -- size of the window of surrounding tokens

    Returns:
        tuple -- (batch, labels) arrays of target tokens and context tokens
    """
    # `data` is a module-level sequence of integer token IDs that is expected
    # to be populated before this function is called
    global data_index
    # data_index is reset here, so every call starts from the beginning of data
    data_index = 0
    # match these parameters to the initial word2vec window
    assert batch_size % num_skips == 0
    assert num_skips <= 2 * skip_window
    batch = np.ndarray(shape=(batch_size), dtype=np.int32)
    labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
    span = 2 * skip_window + 1
    buffer = collections.deque(maxlen=span)
    if data_index + span > len(data):
        data_index = 0
    buffer.extend(data[data_index:data_index + span])
    data_index += span
    for i in range(batch_size // num_skips):
        context_words = [w for w in range(span) if w != skip_window]
        words_to_use = random.sample(context_words, num_skips)
        # `context_word` is used for the loop variable so it does not shadow
        # the `context_words` list above
        for j, context_word in enumerate(words_to_use):
            batch[i * num_skips + j] = buffer[skip_window]
            labels[i * num_skips + j, 0] = buffer[context_word]
        if data_index == len(data):
            buffer.extend(data[0:span])
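            # The original file appears to end here; the completion below is
            # an assumed sketch following the standard word2vec skip-gram
            # batch generator, not code taken from the source.
            data_index = span
        else:
            buffer.append(data[data_index])
            data_index += 1
    # Backtrack a little so tokens at the end of the data are not skipped
    data_index = (data_index + len(data) - span) % len(data)
    return batch, labels

# Example usage (assumes `data` has been populated with integer token IDs,
# e.g. indices of the hashed flow tokens produced above):
#
#   batch, labels = generate_batch(batch_size=128, num_skips=2, skip_window=1)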