# Indexer.py
import json  # the crawled articles are stored as JSON files
import pickle
import os.path
from pathlib import Path
from nltk.stem.snowball import SnowballStemmer
import time
import numpy as np
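# the Snowball (Porter2) stemmer folds inflected word forms together, e.g. "indexing" -> "index"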
snow_stemmer = SnowballStemmer(language='english')
lexicon = {}
# load the precomputed stop-word set
with open("stopwords.pkl", "rb") as a_file:
    stop_words = pickle.load(a_file)
lx_id = 0
docid = [[]]
url_check = {}
barrels = {}
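# Data layout:
#   lexicon   maps a stemmed word to [wordID, document frequency]
#   docid     is indexed by docID and holds [url, title] for each document
#   url_check records URLs that were already indexed so duplicates are skipped
#   barrels   maps a barrelID to its open forward-index file handle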
def update_invertedindex(filepath):
    global lexicon
    global lx_id
    global docid
    global url_check
    global barrels
    barrels = {}
    # load the previous lexicon and get the next wordID and document information
    if Path("Lexicon.pkl").is_file():
        with open("Lexicon.pkl", 'rb') as l:
            lexicon = pickle.load(l)
        lx_id = len(lexicon)
    if Path("docid.pkl").is_file():
        with open("docid.pkl", 'rb') as d:
            docid = pickle.load(d)
        # rebuild the duplicate-URL set from the stored document table so articles
        # indexed in earlier runs are not indexed again (entry 0 is the placeholder)
        for entry in docid[1:]:
            url_check[entry[0]] = 0
    # load the new file and build its forward-index entries
    with open(filepath, 'r') as nf:
        f_obj = json.loads(nf.read())
    print("articles in batch:", len(f_obj))
    update_data(f_obj)
    # save the updated lexicon and the updated document information
    with open("Lexicon.pkl", "wb") as u_file:
        pickle.dump(lexicon, u_file)
    with open("docid.pkl", "wb") as u_file:
        pickle.dump(docid, u_file)
    # rebuild the inverted index for the barrels that changed
    create_invertedindex()
# create the inverted index from the forward-index barrels
def create_invertedindex():
    # dicts preserve insertion order (Python 3.7+), so the last lexicon entry holds the highest wordID
    number_of_barrels = lexicon[list(lexicon)[-1]][0] // 400
    arr = [[0] * 400 for a in range(number_of_barrels + 1)]  # per-barrel cumulative frequency of each word
    # only the barrels that were opened while updating the forward index need to be rebuilt
    binids = list(barrels.keys())
    # they may have been opened in any order, so sort the barrel ids
    binids.sort()
    i = 0
    j = 0
    # build the cumulative word frequencies for every barrel
    for k in lexicon:
        if i == 0:
            arr[j][i] = lexicon[k][1]
        else:
            arr[j][i] = arr[j][i - 1] + lexicon[k][1]
        i += 1
        if i == 400:
            i = 0
            j += 1
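    # Illustrative example (values are made up): if a barrel's first three word
    # frequencies are [2, 1, 3], its cumulative row begins [2, 3, 6]; the counting
    # sort below places each posting at index --arr[barrel][wordOffset], which
    # groups the barrel's postings by wordID.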
    # the last barrel is usually partial; propagate the final cumulative value
    # into the unused trailing slots so the row stays monotone
    last = 0
    for index, a in enumerate(arr[number_of_barrels]):
        if a != 0:
            last = a
        else:
            arr[number_of_barrels][index] = last
    # store the cumulative frequencies for use at search time
    with open("arr.pkl", "wb") as a_file:
        pickle.dump(arr, a_file)
    # counting-sort each barrel and write the result to its inverted-index file;
    # only the barrels that were open during the update are re-sorted
    for i in binids:
        sizeofbarrel = arr[i][-1]
        listinverted = [[''] * 4 for sb in range(sizeofbarrel)]
        # the forward barrels are already open ('a+' mode) from building/updating
        # the forward index, so seek back to the start before reading
        barrels[i].seek(0)
        fobj = barrels[i].readlines()
        for line in fobj:
            # each line is docID#wordOffset#rank#hits (the last field keeps its newline)
            doc, wid, rank, hits = line.split('#')
            arr[i][int(wid)] -= 1
            sortplace = arr[i][int(wid)]
            listinverted[sortplace][0] = doc
            listinverted[sortplace][1] = wid
            listinverted[sortplace][2] = rank
            listinverted[sortplace][3] = hits
        barrels[i].close()
        with open('InvertedIndex/' + str(i) + '.txt', 'w') as wobj:
            for f in listinverted:
                wobj.write(f[0] + '#' + f[1] + '#' + f[2] + '#' + f[3])
# distribute the words of one article into barrels according to wordID
def create_forwardindex(fwdix, single):
    docid.append([single['url'], single['title']])
    for wid, hits in fwdix.items():
        binid = wid // 400  # integer division by 400 gives the barrelID the word belongs to
        newid = wid % 400  # wid mod 400 is the offset from the smallest wordID in the barrel
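        # Illustrative example (values are made up): wordID 1234 goes to barrel
        # 1234 // 400 = 3 at offset 1234 % 400 = 34, so each barrel covers 400 wordIDs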
        if binid not in barrels:
            barrels[binid] = open('ForwardIndex/' + str(binid) + '.txt', 'a+')  # create the barrel file if it does not exist yet
        rank = int(np.sum(np.array(hits), axis=0)[0])  # rank = sum of the hit types (title hits count 10, plain hits 1)
        # write docID, word offset, rank and hits for this word in this document to its barrel
        barrels[binid].write(str(len(docid) - 1) + "#" + str(newid) + "#" + str(rank) + "#" + str(hits) + "\n")
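        # A forward-index line therefore looks like (illustrative values):
        #     57#34#12#[[10, 0], [1, 5], [1, 9]]
        # i.e. docID 57, word offset 34, rank 10 + 1 + 1 = 12, then the hit list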
# walk through the articles, updating the lexicon and the forward index
def update_data(obj):
    global lx_id
    # process each article in turn
    for i in range(len(obj)):
        if obj[i]['url'] in url_check:  # skip articles that were already indexed
            continue
        url_check[obj[i]['url']] = 0
        fx = {}
        # index the title of the article
        t = obj[i]['title']
        word = ""
        loc = 0
        # scan the title character by character; a trailing space is appended so
        # the final word is flushed even when the text ends mid-word
        for cr in t + ' ':
            if cr.isalpha():
                word += cr
            else:
                word = word.lower()
                if len(word) <= 2 or word in stop_words:
                    word = ""
                    continue
                word = snow_stemmer.stem(word)
                # a new word gets the next wordID and a document frequency of 0;
                # the stemmed form may itself be a stop word, so check again
                if word not in lexicon:
                    if word in stop_words:
                        word = ""
                        continue
                    lexicon[word] = [lx_id, 0]
                    lx_id += 1
                wid = lexicon[word][0]
                hit = [10, loc]  # hit type 10 marks a fancy (title) hit, plus the word's position
                loc += 1
                # the first hit for a word in an article also bumps its document frequency
                if wid not in fx:
                    lexicon[word][1] += 1
                    fx[wid] = [hit]
                # a repeated word just appends another hit
                else:
                    fx[wid].append(hit)
                word = ""
        # index the content of the article
        c = obj[i]['content']
        word = ""
        loc = 0
        # scan the content the same way; the trailing space again flushes the final word
        for cr in c + ' ':
            if cr.isalpha():
                word += cr
            else:
                word = word.lower()
                if len(word) <= 2 or word in stop_words:
                    word = ""
                    continue
                word = snow_stemmer.stem(word)
                # a new word gets the next wordID and a document frequency of 0
                if word not in lexicon:
                    if word in stop_words:
                        word = ""
                        continue
                    lexicon[word] = [lx_id, 0]
                    lx_id += 1
                wid = lexicon[word][0]
                hit = [1, loc]  # hit type 1 marks a plain (content) hit, plus the word's position
                loc += 1
                if wid not in fx:
                    lexicon[word][1] += 1
                    fx[wid] = [hit]
                elif len(fx[wid]) <= 6:  # stop appending once 7 hits are stored for this word in this article
                    fx[wid].append(hit)
                word = ""
        create_forwardindex(fx, obj[i])  # write this article's entries to the forward index
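# Two entry points: the block below performs a full build from every JSON file in
# ./newsdata when no Lexicon.pkl exists yet, while update_invertedindex() folds a
# single new JSON batch into an existing index. A hedged usage sketch (the file
# name is hypothetical):
#     update_invertedindex('newsdata/new_batch.json')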
if not os.path.exists('Lexicon.pkl'):
    cwd = os.getcwd()
    if not Path(cwd + "/ForwardIndex").is_dir():
        os.makedirs("ForwardIndex")
    if not Path(cwd + "/InvertedIndex").is_dir():
        os.makedirs("InvertedIndex")
    cwd += '/newsdata'
    start_time = time.time()
    files = [o for o in os.listdir(cwd) if o.endswith('.json')]  # all json files in the newsdata directory
    # walk through each json file, updating the lexicon and the forward index
    for f in files:
        with open(cwd + '/' + f, 'r') as myjsonfile:
            fileobj = json.loads(myjsonfile.read())
        print("articles in", f, ":", len(fileobj))
        update_data(fileobj)
    # save the lexicon
    with open("Lexicon.pkl", "wb") as a_file:
        pickle.dump(lexicon, a_file)
    # save the document table
    with open("docid.pkl", "wb") as a_file:
        pickle.dump(docid, a_file)
    # build the inverted index (this also closes the open forward-index files)
    create_invertedindex()
    print("time:", time.time() - start_time)