-
Notifications
You must be signed in to change notification settings - Fork 7
/
extract_apifeatures.py
49 lines (43 loc) · 1.57 KB
/
extract_apifeatures.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
import json, glob, os
import numpy as np
from sklearn.ensemble import RandomForestClassifier
# Build a binary API-presence feature matrix from the JSON reports in
# `apistats_dir`, rank the API features with a random forest, keep the
# `select_number` most important columns, and save them split by class.
apistats_dir = './apistats'
select_number = 80  # number of top-ranked API features kept in the output

# glob returns paths in arbitrary order; sorting keeps the row order (and the
# column order, which follows first-seen API names) stable between runs.
fs = sorted(glob.glob(os.path.join(apistats_dir, '*.json')))

# Parse every report exactly once, keeping only what is needed later:
# the API names (keys of the 'apistats' mapping) and the 'class' label.
apis = []     # API names in first-seen order; list position == column index
loc = {}      # API name -> column index in x
samples = []  # one (api_names, class_label) pair per file, aligned with fs
for path in fs:
    with open(path, 'r') as jsonfile:
        data = json.load(jsonfile)
    capis = list(data['apistats'])
    cls = data['class']
    for api in capis:
        # Dict membership is O(1); testing against the `apis` list would make
        # vocabulary construction quadratic in the number of distinct APIs.
        if api not in loc:
            loc[api] = len(apis)
            apis.append(api)
    samples.append((capis, cls))

n_samples = len(fs)
n_features = len(apis)

# x[i, j] == 1 when sample i has API j in its 'apistats'; y[i] == 1 when the
# sample's class is 'malware', and 0 for every other class value.
x = np.zeros((n_samples, n_features))
y = np.zeros((n_samples, ))
for i, (capis, cls) in enumerate(samples):
    if cls == 'malware':
        y[i] = 1
    for api in capis:
        x[i, loc[api]] = 1

feat_labels = apis  # feature column names

# 2000 trees; n_jobs=-1 leaves the number of parallel workers to the host.
forest = RandomForestClassifier(n_estimators=2000, random_state=0, n_jobs=-1)
forest.fit(x, y)

# Per-column importance scores reported by the fitted forest.
importances = forest.feature_importances_
# argsort is ascending, so reverse it: most important column first.
indices = np.argsort(importances)[::-1]

# Print the full ranking (every feature, not only the selected ones).
for rank, idx in enumerate(indices):
    print("%2d) %-*s %f" % (rank + 1, 30, feat_labels[idx], importances[idx]))

# Keep only the top-ranked columns (all of them if there are fewer than
# select_number), then split the rows by label.
x = x[:, indices[:select_number]]
mal_mask = y == 1
ben_mask = y == 0
xmal = x[mal_mask]
ymal = y[mal_mask]
xben = x[ben_mask]
yben = y[ben_mask]
np.savez('data80.npz', xmal=xmal, ymal=ymal, xben=xben, yben=yben)