-
Notifications
You must be signed in to change notification settings - Fork 0
/
mk-indices.py
49 lines (39 loc) · 1.07 KB
/
mk-indices.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
#!/usr/bin/python
import bz2
import sys
import cdb
def progress(i, n=10000):
    """Drain the iterable *i*, printing one dot per *n* items consumed.

    The items themselves are discarded; the iterable is consumed purely for
    its side effects. A '.' is written to stdout (and flushed) each time a
    full batch of *n* items has been pulled. When the iterable is exhausted a
    newline is written and the function returns. A trailing partial batch
    does not produce a dot.

    Parameters:
        i: any iterable; it is wrapped with ``iter()`` before use.
        n: number of items per progress dot (default 10000).

    Raises:
        ValueError: if *n* is smaller than 1, since an empty batch would
            never advance the iterator and the loop would never terminate.
    """
    if n < 1:
        raise ValueError('n must be a positive integer')
    items = iter(i)
    while True:
        try:
            # Pull one full batch; StopIteration escapes the for-loop as soon
            # as the source runs dry, including in the middle of a batch.
            for _ in range(n):
                next(items)
        except StopIteration:
            # Terminate the row of dots.
            sys.stdout.write('\n')
            return
        sys.stdout.write('.')
        # Flush so the dot is visible immediately even when stdout is buffered.
        sys.stdout.flush()
def make_indices(path):
    """Build the id, title and offset lookup databases for the dump at *path*.

    The bzip2-compressed index is read from the file name obtained by
    replacing ``.xml.bz2`` with ``-index.txt.bz2`` in *path*. Every index
    line is split on its first two colons into ``offset:id:title`` (the title
    keeps any further colons), and three databases are written next to *path*:

    * ``<path>.ids``     -- id    -> title
    * ``<path>.titles``  -- title -> id
    * ``<path>.offsets`` -- id    -> offset

    Each database is created with ``cdb.cdbmake`` using ``<name>.tmp`` as its
    second (temporary-file) argument and is completed with ``finish()`` once
    the whole index has been read.

    Parameters:
        path: path of the ``.xml.bz2`` dump whose index should be processed.

    Raises:
        ValueError: if a non-empty index line has fewer than three
            colon-separated fields.
    """
    index_path = path.replace('.xml.bz2', '-index.txt.bz2')
    id_path = '%s.ids' % path
    title_path = '%s.titles' % path
    offset_path = '%s.offsets' % path

    # The context manager guarantees the index file is closed even when a
    # malformed line aborts the build part-way through.
    with bz2.BZ2File(index_path) as index_file:
        id_db = cdb.cdbmake(id_path, id_path + '.tmp')
        title_db = cdb.cdbmake(title_path, title_path + '.tmp')
        offset_db = cdb.cdbmake(offset_path, offset_path + '.tmp')

        def build():
            """Add one index record per step so progress() can count them."""
            for line in index_file:
                # Strip only the newline: slicing off the last character
                # would truncate the title of a final unterminated line.
                record = line.rstrip('\n')
                if not record:
                    # A blank line carries no record; skip it.
                    continue
                # maxsplit=2 keeps colons that are part of the title.
                offset, page_id, title = record.split(':', 2)
                id_db.add(page_id, title)
                title_db.add(title, page_id)
                offset_db.add(page_id, offset)
                yield

        progress(build())

    id_db.finish()
    title_db.finish()
    offset_db.finish()
def main():
    """Command-line entry point: index the dump named by the first argument.

    Reads the dump path from ``sys.argv[1]`` and passes it to
    ``make_indices``. Any additional arguments are ignored. When no path is
    given, a usage line is written to stderr and the process exits with
    status 2 instead of failing with an IndexError traceback.
    """
    if len(sys.argv) < 2:
        program = sys.argv[0] if sys.argv else 'mk-indices.py'
        sys.stderr.write('usage: %s DUMP.xml.bz2\n' % program)
        sys.exit(2)
    make_indices(sys.argv[1])
# Run the indexer only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()