analysis_utils.py
# coding: utf-8
from datasketch import MinHash, MinHashLSH
from datetime import datetime
from difflib import SequenceMatcher
from matplotlib import font_manager, pyplot as plt
from nltk.corpus import stopwords
import numpy as np
import pandas as pd
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression, ElasticNetCV, LogisticRegressionCV
from sklearn.metrics import make_scorer, auc, roc_auc_score, roc_curve
import time
from read_write_utils import load_joined_df
from utils import *
pos_emojis = "(\:‑\)|\:\)|\:-\]|\:\]|\:-3|\:->|\:>|8-\)|\:-\}|\:\}|\:o\)|\:c\)|\:^\)|=\]|=\)|\:‑D|" \
"\:D|8‑D|x‑D|xD|X‑D|XD|=D|=3|B^D|\:-\)\)|\:'‑\)|\:'\)|\xF0\x9F\x98\x81|\xF0\x9F\x98\x82|" \
"\xF0\x9F\x98\x83|\xF0\x9F\x98\x84|\xF0\x9F\x98\x85|\xF0\x9F\x98\x86|\xF0\x9F\x98\x89|" \
"\xF0\x9F\x98\x8A|\xF0\x9F\x98\x8B|\xF0\x9F\x98\x8C|\xF0\x9F\x98\x8D|\xF0\x9F\x98\x8F|" \
"\xF0\x9F\x91\x8D|\xF0\x9F\x91\x8F|\xF0\x9F\x91\x90|\xF0\x9F\x98\x80|\xF0\x9F\x98\x87|" \
"\xF0\x9F\x91\x8C|\xE2\x9D\xA4|\xE2\x99\xA5)"
neg_emojis = "(\:‑\(|\:\(|\:‑c|\:c|\:‑<|\:<|\:‑\[|\:\[|\:-\|\||>\:\[|\:{|\:@|>\:\(|\:'‑\(|\:'\(|D‑'\:|D\:<|" \
"D\:|D;|D=|\xF0\x9F\x98\x90|\xF0\x9F\x98\x91|\xF0\x9F\x98\x95|\xF0\x9F\x98\x9F|\xF0\x9F\x98\xA6|" \
"\xF0\x9F\x98\xA7|\xF0\x9F\x91\x8E|\xF0\x9F\x98\x9E|\xF0\x9F\x98\xA0|\xF0\x9F\x98\xA1|" \
"\xF0\x9F\x98\xA2|\xF0\x9F\x98\xA3|\xF0\x9F\x98\xA4|\xF0\x9F\x98\xA5e|\xF0\x9F\x98\xA8|" \
"\xF0\x9F\x98\xA9|\xF0\x9F\x98\xAA|\xF0\x9F\x98\xAB|\xF0\x9F\x98\xAD|\xF0\x9F\x98\xB0|" \
"\xF0\x9F\x98\xB1|\xF0\x9F\x98\xB2)"
def similar(a, b):
"""
    Ratio of similarity between two strings, computed with difflib's SequenceMatcher.
    :param a: first string (cast to str)
    :param b: second string (cast to str)
    :return: float similarity ratio in [0, 1]
"""
return SequenceMatcher(None, str(a), str(b)).ratio()
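# Rough illustration with hypothetical strings: similar("cold beer", "cold beers") is close to 1.0,
# while two unrelated strings score much lower; the sentiment analysis below treats pairs with a
# ratio >= 0.7 as near-duplicates.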
def remove_similar_tweets(df, text_col="text", lang_col="lang_x", max_jaccard_similarity=0.5):
    """
    Use locality-sensitive hashing (MinHash LSH) to efficiently remove tweets that are similar to
    other tweets (which might be auto-generated tweets or retweets).
    English tweets only.
    """
t0 = time.time()
df["tweet_clean"] = np.vectorize(TweetsAnalysis.preprocess_tweet)(df[text_col], df[lang_col])
tweets = [t.split(" ") for t in df["tweet_clean"]]
t1 = time.time()
print t1 - t0, "cleaned tweets"
    lsh = MinHashLSH(threshold=max_jaccard_similarity, num_perm=64)  # threshold on Jaccard similarity
idx_selected = {}
df_indices = df.index.values.tolist()
for idx, tweet in zip(df_indices, tweets):
s = MinHash(num_perm=64)
for word in tweet:
s.update(word.encode('utf8'))
# only add if the tweet is not similar to existing ones
if len(lsh.query(s)) == 0:
lsh.insert(idx, s)
idx_selected[idx] = True
t2 = time.time()
print t2 - t1, "created lsh"
# only select the first tweet in a group of similar tweets
df['select'] = pd.Series([idx_selected.get(idx, False) for idx in df_indices], index=df_indices)
print df["select"].value_counts()
t3 = time.time()
print t3-t2, "selected df"
return df[df["select"]]
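# Hypothetical usage, assuming `df` comes from load_joined_df and has "text" and "lang_x" columns:
#     df_deduped = remove_similar_tweets(df, text_col="text", lang_col="lang_x",
#                                        max_jaccard_similarity=0.5)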
def get_most_used_words(df, text_col='text'):
    """Count word occurrences over all tweets in df[text_col]; returns a Series, most used words first."""
    text_list = df[text_col].values.tolist()
    text_list = [str(x) for x in text_list]  # cast to str so nan values do not break the split below
# split strings into words
word_list = []
for x in text_list:
word_list += x.split(' ')
word_list = pd.Series(word_list)
return word_list.value_counts()
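# Hypothetical usage: print the ten most frequent tokens in the raw tweet text.
#     word_counts = get_most_used_words(df, text_col="text")
#     print word_counts.head(10)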
class TweetsAnalysis:
    def __init__(self, begin_time, end_time, root_path='/home/paul/databeers'):
        """
        :param begin_time: datetime.datetime, begin time for the data
        :param end_time: datetime.datetime, end time for the data
        :param root_path: path to the databeers directory (machine dependent, Paul's Linux or Mac)
        """
        self.beg_time = begin_time
        self.end_time = end_time
        self.root_path = root_path
def kein_bier_vor_vier(self):
"""
Analyze the German saying "Kein Bier vor 4" (No beer before 4 (p.m.)):
Plot (relative) number of German vs non-German beer tweets dependent on the hour.
:param begin_time:
:param end_time:
:param root_path:
:return:
"""
time_df = load_joined_df(self.beg_time, self.end_time,
columns=['utc_offset', 'created_at', 'lang'], root_path=self.root_path)
print len(time_df)
time_df = time_df[time_df['utc_offset'] != ""]
time_df = add_local_time_col(time_df)
time_df['is_german'] = time_df['lang_x'] == 'de'
german_tweets = sum(time_df['is_german'])
non_ger_tweets = len(time_df) - german_tweets
tweets_abs_hour = (time_df.groupby(
[time_df.local_time.dt.hour, 'is_german'])
.count()['created_at']
.unstack()
)
tweets_rel_hour = pd.DataFrame()
tweets_rel_hour["German"] = 100 * tweets_abs_hour[True] / german_tweets
tweets_rel_hour["Other"] = 100 * tweets_abs_hour[False] / non_ger_tweets
ax = tweets_rel_hour.plot(figsize=(16, 9))
ax.set_xlabel("Hour")
ax.set_ylabel("Share of tweets [%]")
ax.set_xticks(range(0, 24, 2))
return tweets_rel_hour
def brewer_analysis(self):
"""
"Give a man a beer and waste an hour, teach a man to brew and waste a lifetime"
Quantitative analysis of the previous statement, i.e. check if people who tweet about beer a lot are brewers.
:param begin_time: datetime.datetime, begin time for data
:param end_time: datetime.datetime, begin time for data
:param root_path: to the databeers directory(dependent on machine, Paul's Linux or Mac)
:return:
"""
descr_df = load_joined_df(self.beg_time, self.end_time, columns=['text', 'description', 'id_str'],
root_path=self.root_path)
brewer_pattern = u"(?i)brewing|brewer|cervejaria|cervecería|brasserie|brauerei"
descr_df['brewer'] = descr_df['description'].str.contains(brewer_pattern)
avg_tweets = descr_df.groupby(['brewer', 'id_str_y']).count()['id_str_x'].reset_index().groupby(
'brewer').mean()['id_str_x']
ax = avg_tweets.plot(kind='bar')
ax.set_ylabel("Average Number of Tweets")
return avg_tweets
def awd_analysis(self):
"""
Analyze when people in different countries start to drink, in the UK after work drinks (awd) are popular,
whereas in southern europe everything starts later (and also ends later).
:return:
"""
time_df = load_joined_df(self.beg_time, self.end_time,
columns=['utc_offset', 'created_at', 'lang', 'country'], root_path=self.root_path)
time_df = time_df[time_df['utc_offset'] != ""]
time_df = add_local_time_col(time_df, col_out_name='loc_time')
time_df = add_estimated_country(time_df, col_out_name='est_country')
total_tweets_per_country = time_df.groupby('est_country').count()['created_at'].reset_index()
total_tweets_per_country.columns = ['est_country', 'total_tweets']
thirsty_countries = total_tweets_per_country.sort_values(
'total_tweets', ascending=False)['est_country'].values[:9]
awd_tweets_per_country = time_df[time_df.loc_time.dt.weekday <= 3].groupby(
[time_df.loc_time.dt.hour, 'est_country']).count()['created_at'].reset_index()
awd_tweets_per_country.columns = ['hour', 'est_country', 'n_tweets']
joined_df = awd_tweets_per_country.merge(total_tweets_per_country, on='est_country')
joined_df['share_tweets'] = 100 * joined_df['n_tweets'] / joined_df['total_tweets']
plt_df = joined_df.groupby(['hour', 'est_country']).max()['share_tweets'].unstack()
ax = plt_df[[x for x in thirsty_countries if x != '']].plot()
ax.set_ylabel("% After Work Beers / All Beer")
return plt_df
def socializing_analysis(self):
"""
analyze how the number of beer tweets correlates with friends/followers count
:return:
"""
df = load_joined_df(self.beg_time, self.end_time, root_path=self.root_path)[['id_str_y',
u'default_profile',
u'description',
u'favourites_count',
u'followers_count',
u'friends_count']]
cast_cols = ["favourites_count", "followers_count", "friends_count"]
df[cast_cols] = df[cast_cols].apply(pd.to_numeric)
df = df.groupby(["id_str_y"], as_index=False).agg({"description": "count",
"favourites_count": "max",
"followers_count": "max",
"friends_count": "max"}).rename(
columns={"description": "n_tweets"})
plt_df = df.groupby(["n_tweets"]).agg({"favourites_count": "mean",
"followers_count": "mean",
"friends_count": "mean",
"id_str_y": "count"})
ax = plt_df[cast_cols].plot(xlim=(1, 10))
ax.set_ylabel("Average Number")
return plt_df
@staticmethod
def preprocess_tweet(raw_tweet, language_code, remove_stopwords=True, return_string=True):
"""
preprocess tweet for count vectorizer of the text, this means removing stopwords
:param raw_tweet:
:param language_code:
:param remove_stopwords: Boolean if stopwords should be removed
:param return_string: Boolean if True returns a string, else a list of words.
:return: string (if return string) else list
"""
# Remove non-letters
letters_only = re.sub("[^a-zA-Z#]", " ", raw_tweet)
# Convert to lower case, split into individual words
words = letters_only.lower().split()
if remove_stopwords:
language_code_dict = {"es": "spanish", "pt": "portuguese", "fr": "french", "de": "german", "it": "italian",
"nl": "dutch", "tr": "turkish", "ru": "russian"}
# Use set since it is hash based and hence faster
stops = set(stopwords.words(language_code_dict.get(language_code, "english"))).union({"xd"})
# Remove stop words
meaningful_words = [w for w in words if w not in stops]
else:
meaningful_words = words
if return_string:
return " ".join(meaningful_words)
else:
return meaningful_words
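    # Rough illustration (hypothetical tweet): TweetsAnalysis.preprocess_tweet("The best #beer in the world!", "en")
    # keeps letters and hashtags, lowercases, drops English stopwords such as "the" and "in",
    # and returns something like "best #beer world".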
def sentiment_analysis(self, n_words=1000, plot_top_n=10):
"""
The idea is to get which words have a postive and a negative connotation in terms of beers.
For this we vectorize the text and learn a logistic regression (a LR is chosen for good interpretability)
To get a sparse ouput we use L1 regularisation.
As a ground truth we take for positive tweets, pos emojis and neg emojis for neg tweets.
We do this for english tweets only
:return:
"""
df = load_joined_df(self.beg_time, self.end_time, root_path=self.root_path,
columns=["text", "id_str", "lang"])
df = df[df["lang_x"] == "en"]
df["pos"] = df['text'].str.contains(pos_emojis)
df["neg"] = df['text'].str.contains(neg_emojis)
# filter the df, first get tweets with emojis only, then remove retweets and some auto-generated tweets
filtered_df = df[np.vectorize(np.logical_xor)(df["pos"], df["neg"])]
filtered_df = filtered_df[~filtered_df["text"].str.upper().str.contains("RT @")]
        # some users often post very similar tweets; remove all but one of these.
        # since it is computationally too expensive to compare all tweets with each other, we only
        # compare each tweet with its neighbour (within the same user) and repeat this step 3 times
for _ in range(3):
filtered_df['text_lead'] = filtered_df.groupby(['user_id'])['text'].shift(1)
filtered_df["sim"] = np.vectorize(similar)(filtered_df["text"], filtered_df["text_lead"])
filtered_df = filtered_df[filtered_df["sim"] < 0.7]
        # another problem is auto-generated tweets that are not completely identical but still
        # similar (e.g. tweets generated by an app). To capture these we do something similar as
        # above, but this time comparing neighbours after sorting by text (and by reversed text)
for _ in range(2):
filtered_df['text_lead'] = filtered_df.sort_values(['text'])['text'].shift(1)
filtered_df["sim"] = np.vectorize(similar)(filtered_df["text"], filtered_df["text_lead"])
filtered_df = filtered_df[filtered_df["sim"] < 0.7]
filtered_df["text_reversed"] = filtered_df["text"].map(lambda x: x[::-1])
for _ in range(2):
filtered_df['text_lead'] = filtered_df.sort_values(['text_reversed'])['text'].shift(1)
filtered_df["sim"] = np.vectorize(similar)(filtered_df["text"], filtered_df["text_lead"])
filtered_df = filtered_df[filtered_df["sim"] < 0.7]
filtered_df["cleaned_text"] = np.vectorize(self.preprocess_tweet)(filtered_df['text'], filtered_df['lang_x'])
vectorizer = CountVectorizer(analyzer="word",
tokenizer=None,
preprocessor=None,
stop_words=None,
max_features=n_words)
msk = np.random.rand(len(filtered_df)) < 0.9
train_data_features = vectorizer.fit_transform(filtered_df["cleaned_text"][msk])
test_data_features = vectorizer.transform(filtered_df["cleaned_text"][~msk])
df_uq = pd.DataFrame(train_data_features.toarray())
df_uq["label"] = pd.Series(filtered_df[msk]["pos"].map({True: 1, False: 0}).values)
df_uq = df_uq.drop_duplicates()
# choose L1 regularisation in order to get a sparse solution
lr_model = LogisticRegressionCV(penalty='l1', solver="liblinear", scoring=make_scorer(roc_auc_score))
lr_model.fit(df_uq[range(n_words)].values, df_uq["label"].values)
roc_data = roc_curve(pd.to_numeric(filtered_df[msk]["pos"]),
[x[1] for x in lr_model.predict_proba(train_data_features.toarray())])
print "auc on train set", auc(roc_data[0], roc_data[1])
roc_data = roc_curve(pd.to_numeric(filtered_df[~msk]["pos"]),
[x[1] for x in lr_model.predict_proba(test_data_features.toarray())])
print "auc on test set", auc(roc_data[0], roc_data[1])
coefs = sorted(zip(lr_model.coef_[0], vectorizer.get_feature_names()), key=lambda x: x[0])
fig, axes = plt.subplots(ncols=2)
y = range(plot_top_n)
axes[0].barh(y, [x[0] for x in coefs[-plot_top_n:]], align='center', color='green', zorder=10)
        axes[0].set(title='Words with Positive Connotation')
negs = coefs[:plot_top_n]
negs.reverse()
axes[1].barh(y, [abs(x[0]) for x in negs], align='center', color='red', zorder=10)
        axes[1].set(title='Words with Negative Connotation')
axes[0].set_ylim((-1, plot_top_n))
axes[1].set_ylim((-1, plot_top_n))
        x_lim = max(abs(coefs[0][0]), coefs[-1][0]) * 1.01  # largest absolute coefficient on either side
axes[0].set_xlim((0, x_lim))
axes[1].set_xlim((0, x_lim))
axes[0].invert_xaxis()
axes[0].set(yticks=y, yticklabels=[x[1] for x in coefs[-plot_top_n:]])
axes[0].yaxis.tick_right()
axes[1].set(yticks=y, yticklabels=[x[1] for x in negs])
for ax in axes.flat:
ax.margins(0.03)
ax.grid(True)
fig.tight_layout()
fig.subplots_adjust(wspace=0.59)
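    # Hypothetical usage (dates are placeholders): prints the AUC on the train and test set and
    # draws a bar chart of the words with the strongest positive / negative coefficients.
    #     TweetsAnalysis(datetime(2017, 3, 1), datetime(2017, 3, 31)).sentiment_analysis(n_words=1000)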
def emoji_analysis(self):
"""
easy version of sentiment analysis only analysing the emojis used in the tweet
:return: two pandas Series with the emojis as keys and the number of occurences as values.
"""
df = load_joined_df(self.beg_time, self.end_time, root_path=self.root_path, columns=['text', 'id_str'])
pos = df['text'].str.extract(pos_emojis).value_counts()
pos["pos"] = sum(pos)
neg = df['text'].str.extract(neg_emojis).value_counts()
neg["neg"] = sum(neg)
        # Unfortunately it is not easy to plot this here, since matplotlib has problems displaying
        # the emojis. However, on a Linux machine there is a solution that is described in
        # http://stackoverflow.com/questions/33190985/substitute-node-labels-with-emoji-using-networkx-in-python/
        # 33204587#33204587 - the package that is used there is only available on Linux.
        # Hence we only return the data here and add a separate plotting function.
return pos, neg
def plot_emoji_analysis(pos, neg, plot_top_n):
"""
plotting function for TweetsAnalysis.emoji_analysis
:param pos: pd.Series
:param neg: pd.Series
:param plot_top_n: int
:return:
"""
fig, axes = plt.subplots(ncols=2)
y = range(plot_top_n)
axes[0].barh(y, pos.values[:plot_top_n], align='center', color='green', zorder=10)
axes[0].set(title='Positive Emojis {}'.format(pos["pos"]))
axes[1].barh(y, neg.values[:plot_top_n], align='center', color='red', zorder=10)
axes[1].set(title='Negative Emojis {}'.format(neg["neg"]))
axes[0].set_ylim((-1, plot_top_n))
axes[1].set_ylim((-1, plot_top_n))
    x_lim = max(pos.drop("pos").max(), neg.drop("neg").max()) * 1.01  # ignore the appended totals
axes[0].set_xlim((0, x_lim))
axes[1].set_xlim((0, x_lim))
axes[0].invert_xaxis()
axes[0].set(yticks=y, yticklabels=list(pos.index[:plot_top_n]))
for label in axes[0].get_yticklabels():
label.set_fontproperties(font_manager.FontProperties(family="Symbolia"))
axes[0].yaxis.tick_right()
axes[1].set(yticks=y, yticklabels=list(neg.index[:plot_top_n]))
for label in axes[1].get_yticklabels():
label.set_fontproperties(font_manager.FontProperties(family="Symbolia"))
for ax in axes.flat:
ax.margins(0.03)
ax.grid(True)
fig.tight_layout()
fig.subplots_adjust(wspace=0.19)
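

if __name__ == "__main__":
    # Hypothetical end-to-end run: the date range is a placeholder and the joined tweet data is
    # assumed to already exist under root_path (see read_write_utils.load_joined_df).
    analysis = TweetsAnalysis(datetime(2017, 3, 1), datetime(2017, 3, 31),
                              root_path='/home/paul/databeers')
    analysis.kein_bier_vor_vier()
    pos_counts, neg_counts = analysis.emoji_analysis()
    plot_emoji_analysis(pos_counts, neg_counts, plot_top_n=10)
    plt.show()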