-
Notifications
You must be signed in to change notification settings - Fork 19
/
prepare.py
138 lines (130 loc) · 4.86 KB
/
prepare.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
""" Normalize and parse.
"""
import os
import shutil
import sys
import tarfile
import tempfile
import time
from normalize_arxiv_dump import normalize
from parse_latex_tralics import parse
def prepare(in_dir, out_dir, meta_db, tar_fn_patt, write_logs=False):
    """Extract, normalize and parse every matching TAR archive in in_dir.

    For each entry of ``in_dir`` whose file name contains ``tar_fn_patt``
    the archive is extracted into a temporary directory, the content of its
    top-level folder is flattened (PDF files are counted and deleted), and
    the result is handed to ``normalize`` and then ``parse``. Successfully
    processed archive names are appended to ``<out_dir>/done.log`` and are
    skipped on later runs.

    Args:
        in_dir: Directory containing the TAR archives.
        out_dir: Output directory; created if missing. Also holds done.log.
        meta_db: Passed through unchanged to ``parse``.
        tar_fn_patt: Substring an entry name must contain to be processed.
        write_logs: Passed through to ``normalize`` and ``parse``.

    Returns:
        False if ``in_dir`` does not exist or holds no ``.tar`` entry;
        otherwise None once all archives have been handled.
    """
    if not os.path.isdir(in_dir):
        print('input directory does not exist')
        return False

    in_dir_fns = os.listdir(in_dir)
    # Inspect the whole listing: os.listdir returns entries in arbitrary
    # order, so sampling only a few of them can miss the archives.
    if not any(os.path.splitext(fn)[-1] == '.tar' for fn in in_dir_fns):
        print('input directory doesn\'t seem to contain TAR archives')
        return False

    os.makedirs(out_dir, exist_ok=True)

    # Names of archives completed in earlier runs (one per line).
    done_log_path = os.path.join(out_dir, 'done.log')
    done_tars = set()
    if os.path.isfile(done_log_path):
        with open(done_log_path) as f:
            done_tars = {line.strip() for line in f}

    tar_fns = [fn for fn in in_dir_fns if tar_fn_patt in fn]
    tar_total = len(tar_fns)
    num_pdf_total = 0
    num_files_total = 0
    for tar_idx, tar_fn in enumerate(tar_fns, start=1):
        print('{}/{} ({})'.format(tar_idx, tar_total, tar_fn))
        if tar_fn in done_tars:
            print('done in a previous run. skipping')
            continue
        tar_path = os.path.join(in_dir, tar_fn)
        if not _is_usable_tar(tar_path, tar_fn, in_dir):
            continue
        with tempfile.TemporaryDirectory() as tmp_dir_path:
            # folders for intermediate results
            tmp_dir_gz = os.path.join(tmp_dir_path, 'flattened')
            os.mkdir(tmp_dir_gz)
            tmp_dir_norm = os.path.join(tmp_dir_path, 'normalized')
            os.mkdir(tmp_dir_norm)

            _extract_tar(tar_path, tmp_dir_gz)
            counts = _flatten_extracted(tmp_dir_gz)
            if counts is None:
                # Nothing was extracted; do not mark the archive as done.
                print('"{}" contains no files. skipping'.format(tar_fn))
                continue
            num_files_total += counts[0]
            num_pdf_total += counts[1]

            source_file_info = normalize(
                tmp_dir_gz,
                tmp_dir_norm,
                write_logs=write_logs
            )
            parse(
                tmp_dir_norm,
                out_dir,
                tar_fn,
                source_file_info,
                meta_db,
                incremental=False,
                write_logs=write_logs
            )
        with open(done_log_path, 'a') as f:
            f.write('{}\n'.format(tar_fn))
    print('{} files'.format(num_files_total))
    print('{} PDFs'.format(num_pdf_total))


def _is_usable_tar(tar_path, tar_fn, in_dir):
    """Return True if tar_path is a readable TAR archive, else report why not.

    IO errors are retried indefinitely with a 60 second pause, because the
    input may sit on a network mount that is temporarily unavailable.
    """
    num_tries = 1
    while True:
        try:
            is_tar = tarfile.is_tarfile(tar_path)
        except IsADirectoryError:
            # Must precede the IOError handler: it is a subclass of OSError.
            print(('unexpected directory "{}" in {}. skipping'
                   '').format(tar_fn, in_dir))
            return False
        except IOError as err:
            print(('[{}] IO error when trying check tar file: {}'
                   '').format(num_tries, err))
            num_tries += 1
            time.sleep(60)
            continue
        if not is_tar:
            print(('"{}" is not a TAR archive. skipping'
                   '').format(tar_fn))
        return is_tar


def _extract_tar(tar_path, dest_dir):
    """Extract tar_path into dest_dir, retrying indefinitely on IO errors."""
    num_tries = 1
    while True:
        try:
            # The context manager closes the archive handle even when
            # extraction fails, so retries do not leak open files.
            # NOTE(review): extractall trusts member paths in the archive;
            # confirm the archives come from a trusted source.
            with tarfile.open(tar_path) as tar:
                tar.extractall(path=dest_dir)
            return
        except IOError as err:
            print(('[{}] IO error when trying exract tar file: {}'
                   '').format(num_tries, err))
            num_tries += 1
            time.sleep(60)


def _flatten_extracted(tmp_dir_gz):
    """Move the files of the first extracted folder up into tmp_dir_gz.

    PDF files are deleted instead of moved. Returns a tuple
    ``(num_files, num_pdfs)`` counting the entries seen, or None if
    tmp_dir_gz is empty.
    """
    extracted = os.listdir(tmp_dir_gz)
    if not extracted:
        return None
    # assumes the archive wraps its files in one top-level folder
    containing_path = os.path.join(tmp_dir_gz, extracted[0])
    num_files = 0
    num_pdfs = 0
    for gz_fn in os.listdir(containing_path):
        num_files += 1
        gz_path_tmp = os.path.join(containing_path, gz_fn)
        if os.path.splitext(gz_fn)[-1] == '.pdf':
            num_pdfs += 1
            os.remove(gz_path_tmp)
            continue
        shutil.move(gz_path_tmp, os.path.join(tmp_dir_gz, gz_fn))
    os.rmdir(containing_path)
    return num_files, num_pdfs
def main(argv=None):
    """Command line entry point; returns the process exit status.

    Args:
        argv: Argument list including the program name; defaults to
            ``sys.argv``. Expects input dir, output dir, metadata DB path
            and optionally a file name pattern (default ``'.tar'``).

    Returns:
        2 on wrong usage, 1 if ``prepare`` reported failure (returned
        False), 0 otherwise.
    """
    if argv is None:
        argv = sys.argv
    if len(argv) not in (4, 5):
        print((
            'usage: python3 prepare.py </path/to/in/dir> </path/to/out/dir> '
            '</path/to/metadata.db> [<tar_fn_patt>]'
        ))
        # non-zero so callers/shell scripts can detect the misuse
        return 2
    in_dir, out_dir, meta_db = argv[1:4]
    tar_fn_patt = argv[4] if len(argv) == 5 else '.tar'
    ret = prepare(in_dir, out_dir, meta_db, tar_fn_patt, write_logs=True)
    # prepare returns False on invalid input and None on completion
    return 1 if ret is False else 0


if __name__ == '__main__':
    sys.exit(main())