#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat Nov 17 19:49:45 2018
@author: vivekmishra
"""
# Path: set the working directory for the project
import os
os.chdir('/Users/vivekmishra/Desktop/USC/599-DSS/project')
# Imports
import pandas as pd
import numpy as np
import re
import string
import unicodedata
import seaborn as sns
import matplotlib.pyplot as plt
from nltk.stem import PorterStemmer
from nltk.corpus import words
from sklearn.cluster import KMeans
import scipy
import nltk
nltk.download('words')
from nltk.tokenize.toktok import ToktokTokenizer
tokenizer = ToktokTokenizer()
nltk.download('stopwords')
stopword_list = nltk.corpus.stopwords.words('english')
from sklearn.feature_extraction.text import TfidfVectorizer
import spacy
# spaCy 2.x shortcut model; newer spaCy releases expect spacy.load('en_core_web_sm')
nlp = spacy.load('en')
# Recommendation class
from recommendation import recommendation
# Read pickle - contains the LDA and sentiment analysis results
df = pd.read_pickle('df_senti.pkl')
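# NOTE (assumption based on usage below): df_senti.pkl is expected to provide at
# least the columns 'title', 'desc', 'topic1' and 'id', plus the engagement and
# sentiment columns referenced in the commented-out feature block further down
# ('likes', 'dislike', 'comment', 'senti_title', 'senti_desc', 'senti_subt').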
# Preprocessing functions
def strip_links(text):
    """Replace URLs in the text with a comma separator."""
    link_regex = re.compile('((https?):((//)|(\\\\))+([\w\d:#@%/;$()~_?\+-=\\\.&](#!)?)*)', re.DOTALL)
    links = re.findall(link_regex, text)
    for link in links:
        text = text.replace(link[0], ', ')
    return text
def strip_hashtag(text):
    """Drop hashtag tokens and replace all other punctuation with spaces."""
    entity_prefixes = ['#']
    for separator in string.punctuation:
        if separator not in entity_prefixes:
            text = text.replace(separator, ' ')
    word_list = []
    for word in text.split():
        word = word.strip()
        if word and word[0] not in entity_prefixes:
            word_list.append(word)
    return ' '.join(word_list)
def lemmatize_text(text):
    """Lemmatise with spaCy, keeping pronouns unchanged ('-PRON-' is spaCy 2.x's pronoun lemma)."""
    text = nlp(text)
    text = ' '.join([word.lemma_ if word.lemma_ != '-PRON-' else word.text for word in text])
    return text
def remove_special_characters(text, remove_digits=False):
    """Drop non-alphanumeric characters and normalise to plain ASCII."""
    pattern = r'[^a-zA-Z0-9\s]' if not remove_digits else r'[^a-zA-Z\s]'
    text = re.sub(pattern, ' ', text)
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    return text
def remove_stopwords(text, is_lower_case=False):
    """Drop NLTK stopwords but keep the negations "n't", "not" and "no"."""
    tokens = tokenizer.tokenize(text)
    tokens = [token.strip() for token in tokens]
    whitelist = ["n't", "not", "no"]
    if is_lower_case:
        filtered_tokens = [token for token in tokens if (token not in stopword_list or token in whitelist)]
    else:
        filtered_tokens = [token for token in tokens if (token.lower() not in stopword_list or token in whitelist)]
    filtered_text = ' '.join(filtered_tokens)
    return filtered_text
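# The cleaning pipeline below is applied in this order to both titles and
# descriptions: strip_links -> strip_hashtag -> remove_special_characters
# (with digits removed) -> remove_stopwords -> lemmatize_text.
# Illustrative example (hypothetical input): "Watch this https://youtu.be/x #trailer!"
# loses the URL and the "#trailer" token, then stopwords are dropped and the
# remaining words are lemmatised.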
# TF-IDF for title and desc - features for the models
# Clean the titles (pipeline order as noted above)
title = list(df['title'])
title = [strip_links(sent) for sent in title]
title = [strip_hashtag(sent) for sent in title]
title = [remove_special_characters(sent, remove_digits=True) for sent in title]
title = [remove_stopwords(sent) for sent in title]
title = [lemmatize_text(sent) for sent in title]
vectorizer = TfidfVectorizer(strip_accents='unicode')
title_mat = vectorizer.fit_transform(title)
title_mat = title_mat.toarray()
title_mat = pd.DataFrame(title_mat)
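# title_mat is a dense (n_videos x title_vocabulary) TF-IDF DataFrame; it is
# recomputed again in the feature-selection step further down.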
# Clean the descriptions with the same pipeline
desc = list(df['desc'])
desc = [strip_links(sent) for sent in desc]
desc = [strip_hashtag(sent) for sent in desc]
desc = [remove_special_characters(sent, remove_digits=True) for sent in desc]
desc = [remove_stopwords(sent) for sent in desc]
desc = [lemmatize_text(sent) for sent in desc]
# Joining desc and title to form a word dictionary (one set of unique tokens per video)
word_dict = []
for text, desc_text in zip(title, desc):
    tokens_title = [token.strip() for token in tokenizer.tokenize(text)]
    tokens_desc = [token.strip() for token in tokenizer.tokenize(desc_text)]
    word_dict.append(list(set(tokens_title + tokens_desc)))
word_dict = [' '.join(item) for item in word_dict]
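# Each entry of word_dict is now a single space-joined string of the unique
# title+description tokens for one video, ready for TfidfVectorizer below.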
# Subtitle topic 1 - join the topic words into one string per video
subt = list(df['topic1'])
subt = [' '.join(item) for item in subt]
# TF-IDF over the combined title+description dictionary
vectorizer = TfidfVectorizer(strip_accents='unicode')
word_mat = vectorizer.fit_transform(word_dict)
word_mat = word_mat.toarray()
word_mat = pd.DataFrame(word_mat)
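# word_mat is computed here but, per the note below, not yet used as the feature
# matrix; the clustering step runs on the title-only TF-IDF.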
# Feature selection
# For the time being, only use the title matrix
vectorizer = TfidfVectorizer(strip_accents='unicode')
title_mat = vectorizer.fit_transform(title)
title_mat = title_mat.toarray()
feature_mat = title_mat
feature_df = pd.DataFrame(title_mat)
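# feature_df (title TF-IDF) is the clustering input; word_mat and subt could be
# concatenated here later as additional features.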
# Clustering
# Adding more features to the tf-idf matrix would require scaling first
#title_mat['likes'] = df['likes']
#title_mat['dislike'] = df['dislike']
#title_mat['comment'] = df['comment']
#title_mat['senti_title'] = df['senti_title']
#title_mat['senti_desc'] = df['senti_desc']
#title_mat['senti_subt'] = df['senti_subt']
# Conversion of the dataframe to a sparse binary (term-presence) matrix
no_of_cluster = 5
dense_matrix = np.array(feature_df.values, dtype=bool).astype(int)  # as_matrix() is deprecated in pandas
sparse_matrix = scipy.sparse.csr_matrix(dense_matrix)
kmeans = KMeans(n_clusters=no_of_cluster, random_state=0)
kmeans.fit(sparse_matrix)
print("Top terms per cluster:")
order_centroids = kmeans.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()
for i in range(5):
print("Cluster %d:" % i),
for ind in order_centroids[i, :10]:
print(' %s' % terms[ind]),
print
clusters = kmeans.labels_.tolist()
#Counter for each cluster - To check cluster distribution
from collections import Counter
el = Counter(clusters)
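# el maps cluster label -> number of videos in that cluster; a heavily skewed
# distribution here would suggest revisiting no_of_cluster or the features.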
#Cosine similarity
from sklearn.metrics.pairwise import cosine_similarity
dist = cosine_similarity(feature_mat)
#Silhouette score
from sklearn.metrics import silhouette_score
silhouette_avg = silhouette_score(dist, kmeans.labels_)
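# silhouette_avg ranges from -1 to 1 (higher means tighter, better-separated
# clusters); note it is computed here on the cosine-similarity matrix rather
# than on the raw TF-IDF features.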
# Recommendation
# Testing with a single video id
df = df.reset_index(drop=True)
vid = 'iUdgD8kYU-E'
feature_df['id'] = df['id']
feature_df['clusters'] = clusters
df['clusters'] = clusters
rec_obj = recommendation()
least_rel, most_rel = rec_obj.getRecommendation(vid, df, feature_df)
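# Assumption based on the loops below: getRecommendation returns two lists of
# (video_id, score)-style tuples, the least- and most-relevant recommendations.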
print("The titles of most relevent recommendation")
for item in most_rel:
title_str = df[df['id']== item[0]]['title']
print('Title: ' + str(title_str))
print("The title of least relevent recommendation")
for item in least_rel:
title_str = df[df['id']== item[0]]['title']
print('Title: ' + str(title_str))