-
Notifications
You must be signed in to change notification settings - Fork 0
/
classify.py
162 lines (129 loc) · 5.07 KB
/
classify.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
import collections
import itertools
import logging
import stopwords
import storage
# Maximum number of distinct words get_most_common_words() returns
# (passed to collections.Counter.most_common()).
MOST_COMMON_COUNT = 41
# NOTE(review): not referenced by any code visible in this file; presumably
# read by other modules -- confirm before removing.
ADD_CONTENT_LABELS = False
# Module-level logger named after this module; used by get_edges().
logger = logging.getLogger(__name__)
def shorten(label):
    """Return `label`, abbreviated in the middle when longer than 31 characters.

    Labels of up to 31 characters come back unchanged.  Longer ones keep their
    first 13 and last 13 characters, joined by ' ... ', which makes the result
    exactly 31 characters long.
    """
    if len(label) <= 31:
        return label
    head, tail = label[:13], label[-13:]
    return head + ' ... ' + tail
class Label(storage.Data):
    """Graph node for a single word, drawn with kind 'label', color '#888', size 12."""

    def __init__(self, name):
        super(Label, self).__init__(name)
        self.kind = 'label'
        self.color = '#888'
        self.font_size = 12

    def is_related_item(self, other):
        """Return True when this label's text is one of ``other.words``."""
        # NOTE(review): self.label is presumably set by storage.Data.__init__
        # from `name` -- confirm against storage.Data.
        return self.label in other.words

    def is_duplicate(self, duplicates):
        """Report whether this label was seen before, recording it if not.

        `duplicates` is a set of label texts shared between calls: the first
        label with a given text is added to it and reported as not duplicate,
        any later one with the same text is reported as duplicate.
        """
        seen_before = self.label in duplicates
        if not seen_before:
            duplicates.add(self.label)
        return seen_before
class TooMuch(storage.Data):
    """Warning node stating how many results were left out of the graph.

    Rendered as kind 'label' in red at font size 48.
    """

    def __init__(self, count):
        # `count` is the number of results that are not shown.
        message = 'Search is too broad: %d results not shown.' % count
        super(TooMuch, self).__init__(message)
        self.kind = 'label'
        self.color = 'red'
        self.font_size = 48
def add_contact(contact, contacts):
    """Register `contact` in `contacts` and return the canonical instance.

    `contacts` maps a contact's label to the first contact object seen with
    that label.  If the label is already present the stored object is
    returned; otherwise `contact` is stored under its label and returned.
    """
    # setdefault() inserts only when the label is missing and always hands
    # back whatever ends up stored under it.
    return contacts.setdefault(contact.label, contact)
def get_persons(items):
    """Pair every person with the item that lists it.

    Returns a list of ``(contact, item)`` tuples, walking `items` in order
    and, within each item, its ``persons`` in order.
    """
    pairs = []
    for item in items:
        pairs.extend((contact, item) for contact in item.persons)
    return pairs
def adjacent(items):
    """Yield every pair of neighbouring elements of `items`, in order.

    For ``[a, b, c]`` this yields ``(a, b)`` and then ``(b, c)``.  Sequences
    with fewer than two elements yield nothing.

    :param items: a sliceable sequence (list, tuple, str, ...).
    """
    # A sequence of n elements has n - 1 neighbouring pairs.  Zipping the
    # sequence with itself shifted by one produces all of them, including
    # the last pair, which the former range(len(items) - 2) loop skipped.
    for pair in zip(items, items[1:]):
        yield pair
def get_item_edges(items):
    # type(list) -> set
    """Collect the edges that the items themselves report.

    When at least one item has a truthy ``timestamp``, the smallest and
    largest of those timestamps are first handed to
    ``storage.TimeNode.set_timerange``.  The result is the set of
    ``(item, related)`` tuples for every `related` returned by
    ``item.get_related_items()``.
    """
    known_times = [stamp for stamp in (item.timestamp for item in items) if stamp]
    if known_times:
        storage.TimeNode.set_timerange(min(known_times), max(known_times))
    edges = set()
    for item in items:
        for related in item.get_related_items():
            edges.add((item, related))
    return edges
def remove_duplicates(items, keep_duplicates):
    # type(list, bool) -> list
    """Order items by descending timestamp, drop duplicates and cap the count.

    Items without a timestamp sort as if their timestamp were 0.  Unless
    `keep_duplicates` is true, each item is asked ``is_duplicate(seen)`` with
    one shared set and only items answering False are kept.  If more than
    ``storage.MAX_NUMBER_OF_ITEMS`` remain, the list is cut to that length and
    a ``TooMuch`` node carrying the number of omitted items is appended.
    """
    seen = set()
    ordered = sorted(items, key=lambda item: -(item.timestamp or 0))
    if keep_duplicates:
        results = ordered
    else:
        results = [item for item in ordered if not item.is_duplicate(seen)]
    limit = storage.MAX_NUMBER_OF_ITEMS
    overflow = len(results) - limit
    if overflow > 0:
        results = results[:limit] + [TooMuch(overflow)]
    return results
def get_most_common_words(query, items):
    """Return the set of words that occur most often across `items`.

    For each item, ``item.update_words(items)`` is called and then its
    ``words`` are counted, skipping empty words and any word equal to `query`.
    Each space-separated term of the lower-cased query is removed from the
    tally as well.  At most MOST_COMMON_COUNT words are returned.
    """
    tally = collections.Counter()
    for item in items:
        item.update_words(items)
        tally.update(word for word in item.words if word and word != query)
    for term in query.lower().split(' '):
        # Drop the term if it was counted; a missing term is not an error.
        tally.pop(term, None)
    return {word for word, _ in tally.most_common(MOST_COMMON_COUNT)}
def get_edges(query, items, add_words=False, keep_duplicates=False):
    # type(str, list, bool, bool) -> (list, list)
    """Build the graph for a search: its edges and the items to display.

    Steps, in order:
      1. start from the edges the items report via get_item_edges();
      2. when `add_words` is true, append a Label for every most-common word
         that is not a stopword;
      3. connect every pair of items where either side reports the other as
         related, bumping both items' ``edges`` counters;
      4. run remove_duplicates() and keep only edges whose two ends survived.

    Returns ``(edges, items)`` where `edges` is a list of item pairs and
    `items` is the list produced by remove_duplicates().
    """
    edges = get_item_edges(items)
    if add_words:
        word_labels = [
            Label(word)
            for word in get_most_common_words(query, items)
            if not stopwords.is_stopword(word)
        ]
        # NOTE: `+=` extends the list the caller passed in, in place.
        items += word_labels
    for item1, item2 in itertools.combinations(items, 2):
        if not (item1.is_related_item(item2) or item2.is_related_item(item1)):
            continue
        item1.edges += 1
        item2.edges += 1
        logger.debug(" - add edge %s %s %s - %s" % (item1.is_related_item(item2), item2.is_related_item(item1), item1, item2))
        edges.add((item1, item2))
    items = remove_duplicates(items, keep_duplicates)
    # Discard edges that point at an item dropped by remove_duplicates().
    edges = [edge for edge in edges if all(node in items for node in edge)]
    email_count = sum(1 for item in items if item.kind == "gmail")
    logger.info("Created graph for %d edges and %d items, with %d emails" % (len(edges), len(items), email_count))
    return list(edges), items
def debug_results(labels, items, all_items=None, show_details=False):
    """Dump a classification result to the root logger at DEBUG level.

    The root logger is switched to DEBUG for the duration of the call and its
    previous level is restored afterwards, even if logging an item raises.

    NOTE(review): this assumes `logging` is the standard-library module, which
    offers no get_level()/set_level() functions; the level is therefore read
    from and written to the root logger -- confirm no project-local `logging`
    wrapper is intended.

    :param labels: mapping of label object (with a ``label`` attribute) to the
        iterable of items grouped under it.
    :param items: the items that were included; each needs ``kind``,
        ``label`` and ``uid`` attributes.
    :param all_items: optional iterable of every candidate item.  Items in it
        that are missing from `items` are listed under 'Removed:'.  When
        omitted, nothing is listed there (the name used to be read from an
        undefined variable, raising NameError).
    :param show_details: when true, also log every instance attribute of each
        listed item.
    """
    def one_line(value):
        # Keep multi-line strings on a single log line; pass other values through.
        if isinstance(value, str):
            return value.replace('\n', ' ')
        return value

    def log_item(item):
        logging.debug(' %s %s %s' % (item.kind, repr(item.label), item.uid))
        if show_details:
            for var in vars(item):
                logging.debug(' %s: %s' % (var, one_line(getattr(item, var))))

    root = logging.getLogger()
    previous_level = root.level
    root.setLevel(logging.DEBUG)
    try:
        logging.debug('found %d labels with %d items' % (len(labels), len(items)))
        logging.debug('Included:')
        for item in items:
            log_item(item)
        for key, grouped in labels.items():
            logging.debug(key.label)
            for item in grouped:
                logging.debug(' %s %s' % (item.kind, item.label))
        logging.debug('Removed:')
        for item in set(all_items or ()) - set(items):
            log_item(item)
    finally:
        root.setLevel(previous_level)
if __name__ == '__main__':
    # Manual smoke test: search storage for a fixed query, build the graph
    # and dump its edges and items to the log.

    # Show DEBUG output for this run.  The standard-library logging module
    # has no get_level()/set_level(); the level is set on the root logger.
    logging.getLogger().setLevel(logging.DEBUG)
    query = 'blockchain'
    items = storage.Storage.search(query, 3)
    logging.debug('Found %d items.' % len(items))
    # get_edges() requires the query as its first positional argument.
    edges, all_items = get_edges(query, items)
    logging.debug('Edges:')
    for item1, item2 in edges:
        logging.debug(' %s - %s' % (repr(item1.label), repr(item2.label)))
    logging.debug('Items:')
    for item in items:
        logging.debug(' %s' % repr(item.label))