-
Notifications
You must be signed in to change notification settings - Fork 58
/
Copy pathp17_26.py
41 lines (30 loc) · 1.17 KB
/
p17_26.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
from typing import List, Tuple
from collections import defaultdict
def sparse_similarity(docs_raw: List[Tuple[int, List[int]]]) -> None:
    """Print the Jaccard similarity of every document pair sharing a word.

    The similarity of two documents is the number of distinct words they
    have in common divided by the number of distinct words appearing in
    either of them. Pairs with no word in common are never printed.

    Args:
        docs_raw: ``(doc_id, words)`` pairs. Doc ids must be hashable and
            mutually orderable. If a doc id appears more than once, the last
            word list wins (plain ``dict`` semantics). A word repeated inside
            one document is counted only once.

    Returns:
        None. A header line followed by one line per similar pair is written
        to stdout; within each pair the smaller doc id is printed first.
    """
    # Deduplicate each document's words, keeping first-seen order, so that a
    # repeated word can inflate neither the intersection count nor the
    # document size used for the union.
    unique_words = {
        doc_id: tuple(dict.fromkeys(words))
        for doc_id, words in dict(docs_raw).items()
    }

    # Inverted index: word -> ids of the documents containing it.
    rev_index = defaultdict(set)
    for doc_id, words in unique_words.items():
        for word in words:
            rev_index[word].add(doc_id)

    # Only pairs that actually share a word are ever touched, which is what
    # keeps this cheap when most documents are dissimilar.
    shared_counts = defaultdict(int)
    for doc_id, words in unique_words.items():
        for word in words:
            for other_doc in rev_index[word]:
                # Count each unordered pair once, keyed as (smaller, larger).
                if other_doc > doc_id:
                    shared_counts[(doc_id, other_doc)] += 1

    print("ID1, ID2:\tSIMILARITY")
    for (doc1, doc2), in_common in shared_counts.items():
        # Inclusion-exclusion; never zero because both documents in a
        # recorded pair contain at least the shared word.
        union_size = len(unique_words[doc1]) + len(unique_words[doc2]) - in_common
        docs_simil = in_common / union_size
        print(f"{doc1:3}, {doc2:3}:\t{docs_simil}")
if __name__ == "__main__":
    # Demo corpus: document ids and, position for position, the word ids
    # each of those documents contains.
    sample_ids = (13, 16, 19, 24)
    sample_words = (
        [14, 15, 100, 9, 3],
        [32, 1, 9, 3, 5],
        [15, 29, 2, 6, 8, 7],
        [7, 10],
    )
    # Pair them up into the (doc_id, words) list the function expects.
    sparse_similarity(list(zip(sample_ids, sample_words)))