forked from atharva1996/kmeanscluster
-
Notifications
You must be signed in to change notification settings - Fork 0
/
processing.py
202 lines (170 loc) · 6.64 KB
/
processing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
import sys
import index
import json
import re, string
import copy
from nltk.corpus import stopwords
import nltk
import numpy as np
import matplotlib.pyplot as plt
import random
# Optional graphing backend: the name `plotly` stays False unless the import
# below succeeds, in which case it is rebound to the plotly module.
plotly = False
try:
    import plotly
    from plotly.graph_objs import Scatter, Scatter3d, Layout
except ImportError:
    print("INFO: Plotly is not installed, plots will not be generated.")

# Compiled once at import time: matches any single character listed in
# string.punctuation, used to strip punctuation from words.
regex = re.compile('[%s]' % (re.escape(string.punctuation),))

# English stop-word list from NLTK, loaded once and reused for every tweet.
cachedStopWords = stopwords.words('english')
class kMeans():
    """Cluster tweets around seed tweets using pairwise Jaccard distance.

    Every tweet is reduced to a set of cleaned words; the distance between two
    tweets is the Jaccard distance of those sets.  Clusters start as one seed
    tweet each and are refined by repeatedly assigning every tweet to the
    cluster whose current members are, on average, closest to it.

    Attributes:
        seeds: tweet IDs used as the initial single-member clusters.
        tweets: mapping of tweet ID -> tweet dict (only ``'text'`` is read).
        max_iterations: upper bound on assignment passes made by ``converge``.
        k: number of clusters, ``len(seeds)``.
        clusters: cluster index -> set of tweet IDs.
        rev_clusters: tweet ID -> cluster index (-1 while unassigned).
        jaccardMatrix: ``jaccardMatrix[a][b]`` is the distance between tweets
            ``a`` and ``b``.
        gx, gy: cluster centre coordinates filled in by ``plotClusters``.
    """

    def __init__(self, seeds, tweets, max_iterations=1000):
        """Build the initial clusters and the pairwise distance matrix.

        Args:
            seeds: sequence of tweet IDs, one per cluster.
            tweets: mapping of tweet ID -> dict with a ``'text'`` entry.
            max_iterations: maximum number of assignment passes ``converge``
                may run (defaults to the previously hard-coded 1000).
        """
        self.seeds = seeds
        self.tweets = tweets
        self.max_iterations = max_iterations
        self.k = len(seeds)

        self.clusters = {}       # cluster to tweetID
        self.rev_clusters = {}   # reverse index, tweetID to cluster
        self.jaccardMatrix = {}  # stores pairwise jaccard distance in a matrix
        self.gx = []
        self.gy = []

        self.initializeClusters()
        self.initializeMatrix()

    def jaccardDistance(self, setA, setB):
        """Return the Jaccard distance ``1 - |A & B| / |A | B|`` of two sets.

        Two empty sets are treated as identical (distance 0.0) instead of
        dividing by zero; this happens for tweets whose words are all
        filtered out by ``bagOfWords``.

        If ``setB`` is not iterable the resulting ``TypeError`` is reported
        on stdout and ``None`` is returned.
        """
        try:
            union = setA.union(setB)
            if not union:
                return 0.0
            return 1 - float(len(setA.intersection(setB))) / float(len(union))
        except TypeError:
            print('Invalid type. Type set expected.')

    def bagOfWords(self, string):
        """Yield the cleaned words of a tweet text.

        The text is lower-cased and split on any whitespace.  URLs,
        ``@mentions``, the retweet marker ``rt`` and stop words are dropped;
        punctuation is removed from the remaining words.  Words that are
        empty, ``rt`` or a stop word once their punctuation has been removed
        are dropped as well.

        Note: the parameter is named ``string`` (shadowing the module inside
        this method) to keep the signature unchanged for existing callers.
        """
        # split() without arguments also separates on tabs/newlines and never
        # produces empty strings, so no extra stripping is needed.
        for word in string.lower().split():
            if word.startswith(('http://', 'https://', '@')):
                continue
            token = regex.sub('', word)
            if not token or token == 'rt':
                continue
            # Check both forms: "it's" is listed with its apostrophe, while
            # "the," only matches once the comma is gone.
            if word in cachedStopWords or token in cachedStopWords:
                continue
            yield token

    def initializeMatrix(self):
        """Fill ``jaccardMatrix`` with the distance between every tweet pair.

        Each tweet is tokenised once and each unordered pair is computed once;
        the result is mirrored because the distance is symmetric.
        """
        bags = {}
        for ID in self.tweets:
            bags[ID] = set(self.bagOfWords(self.tweets[ID]['text']))
            self.jaccardMatrix[ID] = {}

        ids = list(bags)
        for position, ID1 in enumerate(ids):
            bag1 = bags[ID1]
            # Start at `position` so the diagonal (ID1 == ID2) is included.
            for ID2 in ids[position:]:
                distance = self.jaccardDistance(bag1, bags[ID2])
                self.jaccardMatrix[ID1][ID2] = distance
                self.jaccardMatrix[ID2][ID1] = distance

    def initializeClusters(self):
        """Mark every tweet as unassigned, then seed each cluster."""
        # Initialize tweets to no cluster
        for ID in self.tweets:
            self.rev_clusters[ID] = -1

        # Initialize clusters with seeds
        for k in range(self.k):
            seed = self.seeds[k]
            self.clusters[k] = {seed}
            self.rev_clusters[seed] = k

    def calcNewClusters(self):
        """Assign every tweet to the cluster with the smallest mean distance.

        The mean is taken over the members of each *current* cluster; empty
        clusters are skipped.  On ties the first cluster in iteration order
        wins.  A tweet for which no cluster could be evaluated keeps its
        current assignment.

        Returns:
            ``(new_clusters, new_rev_cluster)``: cluster index -> set of
            tweet IDs, and tweet ID -> cluster index.
        """
        new_clusters = {}
        for k in range(self.k):
            new_clusters[k] = set()
        new_rev_cluster = {}

        for ID in self.tweets:
            distances = self.jaccardMatrix[ID]
            min_dist = float("inf")
            min_cluster = self.rev_clusters[ID]

            for k, members in self.clusters.items():
                if not members:
                    continue
                avg_dist = sum(distances[ID2] for ID2 in members) / float(len(members))
                if avg_dist < min_dist:
                    min_dist = avg_dist
                    min_cluster = k

            new_clusters[min_cluster].add(ID)
            new_rev_cluster[ID] = min_cluster

        return new_clusters, new_rev_cluster

    def converge(self):
        """Re-assign tweets until the assignment stops changing.

        One assignment pass is always performed; further passes run until two
        consecutive passes agree or ``max_iterations`` passes were made in
        total.  The result is left in ``clusters`` / ``rev_clusters``.
        """
        # calcNewClusters builds fresh containers on every call, so they can
        # be adopted directly without copying.
        self.clusters, self.rev_clusters = self.calcNewClusters()

        for _ in range(1, self.max_iterations):
            new_clusters, new_rev_clusters = self.calcNewClusters()
            if new_rev_clusters == self.rev_clusters:
                return
            self.clusters = new_clusters
            self.rev_clusters = new_rev_clusters

    def printClusterText(self):
        """Print the text of every tweet, cluster by cluster."""
        for k in self.clusters:
            for ID in self.clusters[k]:
                print(self.tweets[ID]['text'])
            print('\n')

    def printClusters(self):
        """Print one ``cluster:id,id,...`` line per cluster."""
        for k in self.clusters:
            print(str(k) + ':' + ','.join(map(str, self.clusters[k])))

    def printMatrix(self):
        """Print every entry of the Jaccard distance matrix."""
        for ID in self.tweets:
            for ID2 in self.tweets:
                print(ID, ID2, self.jaccardMatrix[ID][ID2])

    def plotClusters(self):
        """Draw a scatter plot with one colour per cluster.

        Cluster centres are placed along the diagonal starting at (10, 10),
        each shifted from the previous centre by the distance between its seed
        and the first seed.  Every member is drawn at a random position whose
        offset from the centre is bounded by its distance to the cluster seed.
        The centres are stored in ``gx`` / ``gy``.
        """
        print('\n')
        colors = ['b', 'g', 'r', 'c', 'm', 'y', 'k']
        start = self.seeds[0]

        # Start from a clean slate so repeated calls do not pile up centres.
        self.gx = []
        self.gy = []

        # The centre has its own variables so the random point coordinates
        # below cannot leak into the position of the next cluster centre.
        centre_x = 10
        centre_y = 10
        for n, (k, seed) in enumerate(zip(self.clusters, self.seeds)):
            offset = self.jaccardMatrix[start][seed]
            centre_x = centre_x + offset
            centre_y = centre_y + offset
            self.gx.append(centre_x)
            self.gy.append(centre_y)

            xs = []
            ys = []
            for ID in self.clusters[k]:
                spread = self.jaccardMatrix[seed][ID]
                xs.append(random.uniform(centre_x - spread, centre_x + spread))
                ys.append(random.uniform(centre_y - spread, centre_y + spread))

            # One scatter call per cluster instead of one per point.
            if xs:
                plt.scatter(xs, ys, c=colors[n % len(colors)], marker='o', alpha=0.5)
        plt.show()
def _load_tweets(path):
    """Read one JSON-encoded tweet per line from *path*, keyed by its ``id``."""
    tweets = {}
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline) are not tweets.
                continue
            tweet = json.loads(line)
            tweets[tweet['id']] = tweet
    return tweets


def _load_seeds(path):
    """Read one integer seed tweet ID per line from *path*."""
    seeds = []
    with open(path, 'r') as f:
        for line in f:
            # Lines may end in a comma and/or Windows line endings.
            value = line.strip().rstrip(',').strip()
            if value:
                seeds.append(int(value))
    return seeds


def main(tweets_path="tweets.json", seeds_path="seeds.txt"):
    """Cluster the tweets in *tweets_path* around the seeds in *seeds_path*.

    Loads both files, runs the clustering to convergence, prints the text and
    the IDs of every cluster and finally shows the cluster plot.

    Args:
        tweets_path: file with one JSON tweet object per line; each object
            is stored under its ``'id'`` value.
        seeds_path: file with one integer tweet ID per line (a trailing comma
            on a line is ignored).
    """
    tweets = _load_tweets(tweets_path)
    seeds = _load_seeds(seeds_path)

    kmeans = kMeans(seeds, tweets)
    kmeans.converge()
    kmeans.printClusterText()
    kmeans.printClusters()
    kmeans.plotClusters()
# Run the clustering pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()