cluster.py
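"""Cluster log messages with tf-idf features and k-means.

Reads a training log and a held-out test log, vectorises both with a
shared bag-of-words vocabulary, clusters the training vectors, assigns
the test vectors to the learned clusters, and writes tagged views of
both sets via cluster_utils.
"""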
import os

import numpy as np
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.metrics import silhouette_score
from yellowbrick.cluster import KElbowVisualizer

from cluster_utils import get_options, save_txt, load_log
from cluster_utils import create_log_tag, create_view, create_view_test

def tfidf_feature(documents, maxx, minn, documents_test):
    # Build a shared bag-of-words vocabulary from the training documents,
    # then weight both sets with tf-idf.
    vectorizer = CountVectorizer(max_df=maxx, min_df=minn)
    transformer = TfidfTransformer()
    tf = vectorizer.fit_transform(documents)
    tf_test = vectorizer.transform(documents_test)
    # Fit the idf weights on the training counts only, then apply them to the
    # test counts, so no test statistics leak into the features.
    tfidf_vectors = transformer.fit_transform(tf)
    tfidf_vectors_test = transformer.transform(tf_test)
    words = vectorizer.get_feature_names_out()  # all words in the bag-of-words vocabulary
    save_txt("words", words)
    print("preprocessing finished......")
    # Print the shapes of the train and test tf-idf feature matrices.
    print(tfidf_vectors.shape, tfidf_vectors_test.shape)
    return tfidf_vectors, tfidf_vectors_test

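# A minimal single-step equivalent of tfidf_feature, using the TfidfVectorizer
# imported above — a sketch for comparison, not called from main(), and the
# helper name is illustrative. As above, fitting on the training documents and
# only transforming the test documents keeps the idf weights train-only.
def tfidf_feature_single_step(documents, maxx, minn, documents_test):
    vectorizer = TfidfVectorizer(max_df=maxx, min_df=minn)
    tfidf_vectors = vectorizer.fit_transform(documents)
    tfidf_vectors_test = vectorizer.transform(documents_test)
    return tfidf_vectors, tfidf_vectors_test
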
# Calculate the Euclidean (L2) distance between two vectors.
def distance(a, b):
    total = 0  # avoid shadowing the built-in sum()
    for i, j in zip(a, b):
        total += (i - j) ** 2
    return total ** 0.5

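# Equivalent vectorised form — a sketch using NumPy (imported above as np);
# the norm of the difference is the same Euclidean distance the loop computes.
def distance_np(a, b):
    return np.linalg.norm(np.asarray(a) - np.asarray(b))
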
def kmeans_cluster(data, data_test, n_clusters):
    # Fit k-means on the training vectors, then assign each test vector to
    # the nearest learned centroid.
    kmeans = KMeans(init="k-means++", n_clusters=n_clusters, random_state=0).fit(data)
    labels = kmeans.labels_
    labels_test = kmeans.predict(data_test)
    return labels, labels_test

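# Optional quality check — a sketch using silhouette_score (imported above),
# not called from main(): prints the mean silhouette coefficient for each
# candidate k. Values near 1 mean well-separated clusters; near 0, overlapping ones.
def silhouette_scan(data, k_min=2, k_max=10):
    for k in range(k_min, k_max + 1):
        labels = KMeans(init="k-means++", n_clusters=k, random_state=0).fit_predict(data)
        print(k, silhouette_score(data, labels))
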
def k_elbow(data, k_min, k_max, locate_elbow=True):
    # Run k-means for each k in [k_min, k_max) and plot the distortion curve;
    # when locate_elbow is True, KElbowVisualizer marks the "elbow" where
    # adding clusters stops paying off.
    model = KMeans(init="k-means++")
    visualizer = KElbowVisualizer(model, k=(k_min, k_max), locate_elbow=locate_elbow)
    visualizer.fit(data)  # Fit the data to the visualizer
    visualizer.show()     # Render the elbow plot (show() replaces the deprecated poof())

def main():
    cache_dir = "{}_{}_cache".format(filename, clusters)
    if not os.path.exists(cache_dir):
        os.mkdir(cache_dir)
    filename_test = 'logs/private_test_set.txt'  # the test set on which predictions are made
    documents = load_log(filename, chars)
    documents_test = load_log(filename_test, chars)
    tfidf_vectors, tfidf_vectors_test = tfidf_feature(documents, maxx, minn, documents_test)
    labels, labels_test = kmeans_cluster(tfidf_vectors, tfidf_vectors_test, clusters)
    # Plot the elbow curve on the training vectors to sanity-check the chosen
    # number of clusters.
    k_elbow(tfidf_vectors, 2, 20, locate_elbow=True)
    database, labels_info = create_log_tag(labels, filename)
    database_test, labels_info_test = create_log_tag(labels_test, filename_test)
    create_view(labels_info, database)
    create_view_test(labels_info_test, database_test)

if __name__ == '__main__':
    # Options are parsed by cluster_utils.get_options; these module-level
    # names are read as globals inside main().
    option_dict = get_options()
    filename = option_dict['filename']
    clusters = option_dict['clusters']
    maxx = option_dict['max']
    minn = option_dict['min']
    chars = option_dict['chars']
    main()
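
# Example invocation — a sketch only: the actual option names are defined by
# cluster_utils.get_options, so the flags and paths below are assumptions,
# not the confirmed interface.
#   python cluster.py --filename logs/train_set.txt --clusters 14 --max 0.9 --min 2 --chars 3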