-
Notifications
You must be signed in to change notification settings - Fork 54
/
Copy pathsort_facts_by_LSA_tSNE.py
56 lines (43 loc) · 1.82 KB
/
sort_facts_by_LSA_tSNE.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
# -*- coding: utf-8 -*-
'''
Sort a list of sentences by applying LSA and then t-SNE in sequence (the LSA
vectors are embedded into 1d).
'''
from __future__ import division # for python2 compatibility
from __future__ import print_function
import codecs
import numpy as np
from sklearn.manifold import TSNE
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
# Source text file; it is read as UTF-8 and every non-empty line (after
# stripping whitespace) is treated as one phrase.
# NOTE(review): absolute, machine-specific Windows path - adjust before running elsewhere.
input_path = r'e:\polygon\paraphrasing\data\facts4_1s.txt'
# Destination text file; the phrases are written here as UTF-8, one per line,
# ordered by their 1-d t-SNE coordinate.
output_path = '../tmp/facts4_1s.txt'
# Number of components kept by the TruncatedSVD (LSA) step.
LSA_DIMS = 60
def v_cosine(a, b):
    """Return the cosine similarity of vectors `a` and `b`.

    The result is ``dot(a, b) / (|a| * |b|)``.

    If either vector has zero norm the direction is undefined; in that case
    0.0 is returned instead of dividing by zero (which would produce ``nan``
    and a NumPy RuntimeWarning).
    """
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    if norm_product == 0:
        # Zero-length vector: no meaningful angle, report "no similarity".
        return 0.0
    return np.dot(a, b) / norm_product
# --- Step 1: load the unique, non-empty phrases from the input file ----------
print('Buidling tf-idf corpus...')
# Deduplicate while keeping first-seen order. Building the corpus through a
# plain set() and then list() would give an order that depends on string
# hashing, which is randomized per process in Python 3; the row order feeds
# the randomized SVD and t-SNE, so the final ordering would differ between runs.
seen_phrases = set()
tfidf_corpus = []
with codecs.open(input_path, 'r', 'utf-8') as rdr:
    for line in rdr:
        phrase = line.strip()
        if phrase and phrase not in seen_phrases:
            seen_phrases.add(phrase)
            tfidf_corpus.append(phrase)
print('{} phrases in tfidf corpus'.format(len(tfidf_corpus)))

# --- Step 2: LSA = tf-idf over character 3..5-grams followed by truncated SVD -
print('Fitting LSA...')
vectorizer = TfidfVectorizer(max_features=None, ngram_range=(3, 5), min_df=1, analyzer='char')
svd_model = TruncatedSVD(n_components=LSA_DIMS, algorithm='randomized', n_iter=20, random_state=42)
svd_transformer = Pipeline([('tfidf', vectorizer), ('svd', svd_model)])
svd_transformer.fit(tfidf_corpus)

print('Calculating LSA vectors for query phrases...')
phrase_ls = svd_transformer.transform(tfidf_corpus)

# --- Step 3: embed the LSA vectors into a single dimension with t-SNE --------
print('Running t-SNE')
# Seeded like the SVD above so that repeated runs give the same ordering.
tsne = TSNE(n_components=1, random_state=42)
phrases_1d = tsne.fit_transform(phrase_ls)

# --- Step 4: write the phrases ordered by their 1-d coordinate ---------------
print('Printing results')
# fit_transform returns one row per phrase with a single column; take that
# column as plain Python floats so the sort key is a scalar, not a 1-element
# array.
coordinates = phrases_1d[:, 0].tolist()
# Sort before opening the output file so a failure here cannot leave a
# truncated file behind.
phrases = sorted(zip(tfidf_corpus, coordinates), key=lambda z: z[1])
with codecs.open(output_path, 'w', 'utf-8') as wrt:
    for phrase, _ in phrases:
        wrt.write(u'{}\n'.format(phrase))