tf_idf_one_class_svm.py
from os import listdir
from os.path import isfile, join
from pprint import pprint

import numpy as np
from sklearn import svm
from sklearn.decomposition import PCA

from Utilities.Text import Text
from Utilities.TermFrequencyInverseDocumentFrequency import TermFrequencyInverseDocumentFrequency as TFIDF

def main():
    # Build a tf-idf feature matrix from every document found in ./Texts/.
    path = "./Texts/"
    files = [f for f in listdir(path) if isfile(join(path, f))]
    tfidf = TFIDF()
    for document in files:
        tfidf.add_text_to_corpus(Text(path + document))
    features, word_list = tfidf.calculate_features_for_corpus()
    # Rows 0, 2, 4, 5, 6, 7 form the training (in-class) set; rows 1 and 3 are
    # held out for testing. Note that these hard-coded indices depend on the
    # order in which listdir() returns the files.
    train_rows, test_rows = [0, 2, 4, 5, 6, 7], [1, 3]
    apply_pca = True
    if apply_pca:
        # Keep every available principal component, i.e. at most
        # min(n_samples, n_features). Requesting features.shape[1] components
        # explicitly fails in newer scikit-learn when there are fewer training
        # rows than features.
        pca = PCA()
        x = {
            "train": pca.fit_transform(features[train_rows, :]),
            "test": pca.transform(features[test_rows, :])
        }
    else:
        x = {
            "train": features[train_rows, :],
            "test": features[test_rows, :]
        }
    # Unfortunately, it does not appear to be possible to derive a perfect
    # accuracy solution from the grid search specified below. However, it is
    # provided here anyway for educational purposes.
    grid_search = False
    if grid_search:
        for kernel in ["rbf", "linear", "sigmoid", "poly"]:
            for nu in np.linspace(0.001, 1.0, 200):
                # Skip gamma == 0.0, which newer scikit-learn releases reject.
                for gamma in np.linspace(0.0, 10.0, 200)[1:]:
                    clf = svm.OneClassSVM(nu=nu, kernel=kernel, gamma=gamma)
                    clf.fit(x["train"])
                    # predict() returns +1 for inliers (same class as the
                    # training texts) and -1 for outliers.
                    y = {
                        "train": clf.predict(x["train"]),
                        "test": clf.predict(x["test"])
                    }
                    if all(y["train"] == 1.0) and all(y["test"] == -1.0):
                        pprint({"nu": nu, "gamma": gamma, "y": y, "kernel": kernel})
    # The following settings using term-frequency inverse-document-frequency
    # features give a perfect classification result for the problem of Seneca's
    # authorship attribution. "auto" sets gamma to 1 / n_features, which is what
    # gamma = 0.0 meant in older scikit-learn releases.
    nu, kernel, gamma = 0.84437688442211067, "poly", "auto"
    clf = svm.OneClassSVM(nu=nu, kernel=kernel, gamma=gamma)
    clf.fit(x["train"])
    y = {
        "train": clf.predict(x["train"]),
        "test": clf.predict(x["test"])
    }
    # decision_function() gives the signed distance to the separating boundary:
    # positive values lie on the inlier side, negative values on the outlier side.
    metrics = {
        "train": clf.decision_function(x["train"]),
        "test": clf.decision_function(x["test"])
    }
    pprint({"nu": nu, "gamma": gamma, "y": y, "kernel": kernel, "metrics": metrics})
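
# A minimal alternative sketch, not part of the original pipeline: the TFIDF and
# Text utilities imported above are external to this file, so this helper shows
# how a comparable tf-idf feature matrix could be built directly with
# scikit-learn's TfidfVectorizer, assuming the files in ./Texts/ are plain text.
# The function name and its use of TfidfVectorizer are illustrative and are not
# the author's implementation.
def features_with_tfidf_vectorizer(paths):
    from sklearn.feature_extraction.text import TfidfVectorizer
    # input="filename" makes the vectorizer read each document from disk.
    vectorizer = TfidfVectorizer(input="filename")
    features = vectorizer.fit_transform(paths).toarray()
    # Return a dense feature matrix plus the vocabulary, mirroring the shape of
    # calculate_features_for_corpus() above.
    return features, vectorizer.get_feature_names_out()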

if __name__ == "__main__":
    main()