recommendation.py
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Nov 20 15:49:47 2018
@author: vivekmishra
"""
import operator
import re
import string
import unicodedata

import nltk
import pandas as pd
import spacy
from nltk.tokenize.toktok import ToktokTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

nltk.download('stopwords')
stopword_list = nltk.corpus.stopwords.words('english')

# spacy.load('en', parse=True, tag=True, entity=True) is the pre-3.0 spaCy
# API; on spaCy 3.x, load the small English pipeline by its package name
# (installed via: python -m spacy download en_core_web_sm).
nlp = spacy.load('en_core_web_sm')
# Example query: vid = 'iUdgD8kYU-E'


class recommendation:

    def getRecommendation(self, vid, df, feature_df):
        # Cluster that the query video belongs to.
        cluster_no = feature_df[feature_df['id'] == vid]['clusters'].item()

        # Rows of that cluster, indexed by video id.
        feature_df = feature_df.set_index('id')
        slice_df = feature_df[feature_df['clusters'] == cluster_no]

        # Re-run the text-cleaning pipeline on the titles of this cluster only.
        title_temp = list(df[df['clusters'] == cluster_no]['title'])
        title_temp = [self.strip_links(sent) for sent in title_temp]
        title_temp = [self.strip_hashtag(sent) for sent in title_temp]
        title_temp = [self.remove_special_characters(sent, remove_digits=True)
                      for sent in title_temp]
        title_temp = [self.remove_stopwords(sent) for sent in title_temp]
        title_temp = [self.lemmatize_text(sent) for sent in title_temp]

        # TF-IDF matrix over the cleaned titles.
        vectorizer = TfidfVectorizer(strip_accents='unicode')
        title_temp_mat = vectorizer.fit_transform(title_temp).toarray()
        title_temp_mat = pd.DataFrame(title_temp_mat)

        # Pairwise cosine similarity between titles; rows and columns are
        # labelled with the video ids of the cluster.
        cos_sim = cosine_similarity(title_temp_mat.values)
        df_temp = pd.DataFrame(cos_sim, columns=slice_df.index.values,
                               index=slice_df.index)

        # Similarities of every video in the cluster to the query video:
        # the 3 least similar and the 4 most similar (the query matches
        # itself with similarity 1.0, hence 4 rather than 3).
        interest_list = dict(df_temp[vid])
        index_min = sorted(interest_list.items(), key=operator.itemgetter(1))[:3]
        index_max = sorted(interest_list.items(), key=operator.itemgetter(1),
                           reverse=True)[:4]
        return index_min, index_max
    def strip_links(self, text):
        # Replace any http(s) URL in the text with ', '.
        link_regex = re.compile('((https?):((//)|(\\\\))+([\w\d:#@%/;$()~_?\+-=\\\.&](#!)?)*)', re.DOTALL)
        links = re.findall(link_regex, text)
        for link in links:
            text = text.replace(link[0], ', ')
        return text
    def strip_hashtag(self, text):
        # Replace all punctuation except '#' with spaces, then drop every
        # token that starts with '#'.
        entity_prefixes = ['#']
        for separator in string.punctuation:
            if separator not in entity_prefixes:
                text = text.replace(separator, ' ')
        word_list = []
        for word in text.split():
            word = word.strip()
            if word and word[0] not in entity_prefixes:
                word_list.append(word)
        return ' '.join(word_list)
    def lemmatize_text(self, text):
        # Lemmatize with spaCy, keeping pronouns verbatim ('-PRON-' is the
        # lemma spaCy 2.x assigns to all pronouns).
        doc = nlp(text)
        return ' '.join([word.lemma_ if word.lemma_ != '-PRON-' else word.text
                         for word in doc])
    def remove_special_characters(self, text, remove_digits=False):
        # Keep only letters, whitespace, and (optionally) digits.
        pattern = r'[^a-zA-Z0-9\s]' if not remove_digits else r'[^a-zA-Z\s]'
        text = re.sub(pattern, ' ', text)
        # Strip accents: decompose to NFKD and drop non-ASCII code points.
        text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
        return text
    def remove_stopwords(self, text, is_lower_case=False):
        # Drop English stopwords but always keep negations.
        tokenizer = ToktokTokenizer()
        tokens = [token.strip() for token in tokenizer.tokenize(text)]
        whitelist = ["n't", "not", "no"]
        if is_lower_case:
            filtered_tokens = [token for token in tokens
                               if token not in stopword_list or token in whitelist]
        else:
            filtered_tokens = [token for token in tokens
                               if token.lower() not in stopword_list or token in whitelist]
        return ' '.join(filtered_tokens)