inverted_index.py
import re
import json
import pickle
import math
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from os import listdir
# create stemmer
factory = StemmerFactory()
stemmer = factory.create_stemmer()
# Clean a raw corpus document: extract the <link> and <title> tags,
# then normalise the body text (entities, punctuation, URLs, case).
def preprocessing_doc(lines):
    link = re.findall(r"<link>(.*?)</link>", lines[0])[0]
    title = re.findall(r"<title>(.*?)</title>", lines[1])[0]
    body = ''.join(lines[2:])
    body = body.encode('ascii', 'ignore').decode('utf-8')
    body = re.sub(r"&.*?;", "", body)                  # drop HTML entities
    body = re.sub(r">", "", body)
    body = re.sub(r'[\]\|\[@,$%*&\\()":]', "", body)   # strip punctuation
    body = re.sub(r"-", " ", body)
    # body = re.sub(r'(?<=[.?!])( +|\Z)', r'\n', body)
    body = body.replace('.', '\n')
    body = re.sub(r"^\s+", "", body)
    body = body.replace('\n', ' ').replace('\r', '').replace('  ', ' ')
    body = body.lower()
    body = ' '.join(re.split(r'\.\s+', body))
    body = body.replace("http://", " ").replace("https://", " ")
    return link, title, body
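# Expected corpus file layout, inferred from the regexes above (the exact
# markup of the real corpus may differ; the URL below is only illustrative):
#
#   <link>https://example.com/article-1</link>
#   <title>Example article title</title>
#   First line of the article body. Remaining lines are also body text.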
CORPUS_PATH = './corpus/'
files = [CORPUS_PATH + f for f in listdir(CORPUS_PATH)]
stopwords = set(open('./stopwords.txt').read().split('\n'))

df_data = {}   # document frequency: number of documents containing each term
tf_data = {}   # per-document term frequencies, keyed by document link
idf_data = {}  # inverse document frequency per term

for file in files:
    tf = {}
    with open(file, 'r') as data:
        lines = data.readlines()
    link, title, body = preprocessing_doc(lines)
    stemmed_body = stemmer.stem(body)
    for word in stemmed_body.split():
        if word in stopwords:
            continue
        # tf: term frequency within this document
        if word in tf:
            tf[word] += 1
        else:
            tf[word] = 1
            # df: count each document only once per term
            if word in df_data:
                df_data[word] += 1
            else:
                df_data[word] = 1
    tf_data[link] = {
        'title': title,
        'tf': tf,
        'caption': body[:255]
    }
# Calculate idf with the smoothed formula idf(t) = 1 + log10(N / df(t)),
# where N is the total number of indexed documents
for x in df_data:
    idf_data[x] = 1 + math.log10(len(tf_data) / df_data[x])
# Build the inverted index: for each term, list the documents containing it
# together with the term's tf-idf weight in that document
tf_idf = {}
for word in df_data:
    list_doc = []
    for data in tf_data:
        tf_value = 0
        if word in tf_data[data]['tf']:
            tf_value = tf_data[data]['tf'][word]
        weight = tf_value * idf_data[word]
        doc = {
            'url': data,
            'title': tf_data[data]['title'],
            'score': weight,
            'caption': tf_data[data]['caption']
        }
        # keep only documents where the term actually occurs
        if doc['score'] != 0:
            if doc not in list_doc:
                list_doc.append(doc)
    tf_idf[word] = list_doc
# Write the inverted index to disk as a pickle
with open('./index/inverted_index.pickle', 'wb') as file:
    pickle.dump(tf_idf, file)
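
# Minimal usage sketch: reload the pickle written above and look up a
# single-word query. The query term 'ekonomi' is only an example, and the
# ranking by score is an assumption about how the index would be consumed.
with open('./index/inverted_index.pickle', 'rb') as index_file:
    index = pickle.load(index_file)
query = stemmer.stem('ekonomi')
for doc in sorted(index.get(query, []), key=lambda d: d['score'], reverse=True):
    print(doc['score'], doc['url'], doc['title'])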