-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathtopic_classification.py
76 lines (63 loc) · 2.87 KB
/
topic_classification.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
from modules.svd import apply_svd_basic
def _rank_key_words(coefficients, all_unique_words, num_key_words):
    """
    Picks the words most strongly connected with a single topic.
    :param coefficients: list of absolute word-topic coefficients,
    one value per word
    :param all_unique_words: words, indexed in the same order as coefficients
    :param num_key_words: maximum number of words to pick
    :return: list of at most num_key_words distinct words, strongest first
    """
    # sorted() is stable, also with reverse=True, so words with equal
    # coefficients keep their vocabulary order (lowest index first).
    ranked_indices = sorted(
        range(len(coefficients)),
        key=coefficients.__getitem__,
        reverse=True,
    )
    key_words = []
    for word_index in ranked_indices:
        if len(key_words) >= num_key_words:
            break
        word = all_unique_words[word_index]
        # Never report the same word twice, even if the vocabulary
        # happens to contain a repeated entry.
        if word not in key_words:
            key_words.append(word)
    return key_words


def define_key_words(data, all_unique_words, num_key_words=5):
    """
    Defines the keywords of all topics.
    :param data: term document matrix with applied tf-idf
    :param all_unique_words: unique terms in all docs
    :param num_key_words: how many key words to return per topic
    (default 5); a topic gets fewer words only when the vocabulary
    itself is smaller than this number
    :return: topics main words - one list per topic, each holding the
    topic's words ordered from the strongest connection to the weakest
    :raises ValueError: if num_key_words is negative
    """
    if num_key_words < 0:
        raise ValueError("num_key_words must be non-negative")

    u, _, _ = apply_svd_basic(data)

    # Each row of u holds the coefficients of one word for every topic,
    # so transposing it yields one sequence of word coefficients per topic.
    # Only the strength of the connection matters, hence the abs().
    return [
        _rank_key_words(
            [abs(coefficient) for coefficient in topic_column],
            all_unique_words,
            num_key_words,
        )
        for topic_column in zip(*u)
    ]