#!/usr/bin/env python
# -*- encoding: utf-8 -*-
'''
Various util functions.
Including code from textrank, which is licensed under the MIT License.
'''
from itertools import combinations as combinations
from Queue import Queue as _Queue
import re
WINDOW_SIZE = 2
def get_first_window(split_text, window_size=None):
    '''Return the leading window of tokens from split_text.

    split_text  -- list of word tokens
    window_size -- number of tokens to take; defaults to the module-level
                   WINDOW_SIZE when omitted, preserving the original behavior
    '''
    if window_size is None:
        window_size = WINDOW_SIZE
    return split_text[:window_size]
# tokens is a list of words
def set_graph_edge(graph, tokens, word_a, word_b):
    '''Add the edge (word_a, word_b) to graph, if both words are valid.

    The edge is added only when both words appear in tokens, both are
    already nodes of the graph, and the edge does not exist yet.
    '''
    if word_a not in tokens or word_b not in tokens:
        return
    if not (graph.has_node(word_a) and graph.has_node(word_b)):
        return
    if not graph.has_edge(word_a, word_b):
        graph.add_edge(word_a, word_b)
def process_first_window(graph, tokens, split_text):
    '''Connect every pair of words inside the initial window.'''
    window = get_first_window(split_text)
    for left, right in combinations(window, 2):
        set_graph_edge(graph, tokens, left, right)
def init_queue(split_text):
    '''Create the sliding-window queue for the text.

    The queue is seeded with every word of the first window except the
    first one, i.e. WINDOW_SIZE - 1 entries.
    '''
    queue = _Queue()
    for token in get_first_window(split_text)[1:]:
        queue.put(token)
    return queue
def process_word(graph, tokens, queue, word):
    '''Link word to every word currently held in the window queue.'''
    for neighbour in queue_iterator(queue):
        set_graph_edge(graph, tokens, word, neighbour)
def update_queue(queue, word):
    '''Advance the sliding window: drop the oldest word, admit word.'''
    queue.get()      # discard the word that just left the window
    queue.put(word)  # append the word that just entered it
    # Invariant: the queue always holds exactly WINDOW_SIZE - 1 words.
    assert queue.qsize() == WINDOW_SIZE - 1
def process_text(graph, tokens, split_text):
    '''Slide the co-occurrence window over the remainder of the text.

    For each word after the initial window, connect it to every word in
    the current window, then shift the window forward by one token.
    '''
    queue = init_queue(split_text)
    # range instead of the Python-2-only xrange: identical iteration here
    # and portable to Python 3.
    for i in range(WINDOW_SIZE, len(split_text)):
        word = split_text[i]
        process_word(graph, tokens, queue, word)
        update_queue(queue, word)
def queue_iterator(queue):
    '''Yield each item of queue once without changing its contents.

    Each item is popped, yielded, and immediately re-enqueued, so after a
    complete pass the queue holds the same items in the same order.
    '''
    # range instead of the Python-2-only xrange: identical iteration here
    # and portable to Python 3.
    for _ in range(queue.qsize()):
        item = queue.get()
        yield item
        queue.put(item)
def set_graph_edges(graph, tokens, split_text):
    '''Populate graph with all co-occurrence edges found in split_text.

    Handles the initial window first, then slides the window across the
    rest of the text.
    '''
    process_first_window(graph, tokens, split_text)
    process_text(graph, tokens, split_text)
# returns dictionary of dictionaries: {topic i : {word: count in given topic i}}
def parse_weights_from_file(filename):
    '''Parse per-topic word weights from filename.

    Each line of the file describes one topic as comma-separated
    "word<TAB>count" entries. Malformed entries (not exactly two
    tab-separated fields) are skipped, as before.

    Returns {topic_index: {word: count}} with topic indices assigned in
    file order starting at 0.
    '''
    topics_dict = {}
    with open(filename) as f:
        # enumerate replaces the manual "count = count + 1" counter.
        for topic_id, line in enumerate(f):
            single_topic = {}
            for el in re.split(r',+', line):
                split_el = re.split(r'\t+', el)
                if len(split_el) == 2:
                    single_topic[split_el[0]] = int(split_el[1])
            topics_dict[topic_id] = single_topic
    return topics_dict
def load_docsXtopics_from_file(filename):
    '''Load a documents-x-topics matrix from filename.

    Each line holds comma-separated floats; the fragment after the final
    comma (presumably empty / the newline — rows appear to end with a
    trailing comma, TODO confirm against the writer) is discarded.

    Returns a list of rows, each a list of floats.
    '''
    rows = []
    with open(filename) as f:
        for line in f:
            cells = re.split(r',+', line)[:-1]
            rows.append([float(cell) for cell in cells])
    return rows