-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathhelper.py
37 lines (33 loc) · 1.28 KB
/
helper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from lists import names
def calSimilarity(search, thr=0.9):
    """Map each word of *search* to the most similar known name term.

    Every entry of ``names`` is split on whitespace; pieces that are a single
    character long or that contain a ``.`` are discarded. Each
    whitespace-separated word of *search* is then compared with the remaining
    terms using cosine similarity over character-level TF-IDF vectors.

    Args:
        search: Query text; it is split on whitespace into individual words.
        thr: A search word is matched only when its best similarity is
            strictly greater than this value. Defaults to 0.9.

    Returns:
        A list holding, for every search word whose best similarity exceeds
        *thr*, the name term it is closest to (the earliest term wins ties),
        in query order. The list is empty when the query contains no words
        or when no usable name terms exist.
    """
    search_terms = search.split()
    if not search_terms:
        # Nothing to look up, so skip building the candidate list entirely.
        return []

    name_terms = []
    for name in names:
        name_terms.extend(
            part for part in name.split() if len(part) > 1 and "." not in part
        )
    if not name_terms:
        # With no candidates the similarity row below would be empty and
        # taking its maximum would raise.
        return []

    matches = []
    for word in search_terms:
        # The query word is row 0 of the fitted corpus, so the IDF weights
        # depend on it and the vectorizer has to be refit for every word.
        # token_pattern is not passed: it only applies to the "word" analyzer.
        tfidf_matrix = TfidfVectorizer(analyzer="char").fit_transform(
            [word] + name_terms
        )
        # Compare row 0 with every row, then drop column 0 (the query's
        # similarity with itself) so indices line up with name_terms.
        similarities = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix)[0][1:]
        best = int(np.argmax(similarities))  # first index of the maximum
        if similarities[best] > thr:
            matches.append(name_terms[best])
    return matches
def calSimilarity_words(w1, w2, thr=0.7):
    """Tell whether two words are similar at the character level.

    Both words are turned into character-level TF-IDF vectors (fitted on just
    these two strings) and compared by cosine similarity.

    Args:
        w1: First word.
        w2: Second word.
        thr: The words count as similar only when their cosine similarity is
            strictly greater than this value. Defaults to 0.7.

    Returns:
        True when the similarity exceeds *thr*, otherwise False. An empty
        word is treated as having similarity 0 with anything.
    """
    if not w1 or not w2:
        # An empty string yields an all-zero vector, i.e. similarity 0.
        # Short-circuiting also covers the case of two empty strings, where
        # the vectorizer would be left with an empty vocabulary and fail.
        return bool(0.0 > thr)

    # token_pattern is not passed: it only applies to the "word" analyzer.
    tfidf_matrix = TfidfVectorizer(analyzer="char").fit_transform([w1, w2])
    # cosine_similarity returns a 1x1 matrix here; pull out the scalar
    # instead of relying on the truthiness of a one-element array.
    similarity = cosine_similarity(tfidf_matrix[0], tfidf_matrix[1])[0, 0]
    return bool(similarity > thr)