""" Normalize a arXiv dump
- copy PDF files as is
- unzip gzipped single files
- copy if it's a LaTeX file
- extract gzipped tar archives
- try to flatten contents to a single LaTeX file
- ignores non LaTeX contents (HTML, PS, TeX, ...)
"""
import chardet
import gzip
import magic
import os
import re
import shutil
import subprocess
import sys
import tarfile
import tempfile
from hashlib import sha1
MAIN_TEX_PATT = re.compile(r'(\\begin\s*\{\s*document\s*\})', re.I)
# ^ with capturing parentheses so that the pattern can be used for splitting
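# e.g. re.split(MAIN_TEX_PATT, doc, maxsplit=1) yields
# [<preamble>, '\\begin{document}', <document body>] (see remove_math below)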
PDF_EXT_PATT = re.compile(r'^\.pdf$', re.I)
GZ_EXT_PATT = re.compile(r'^\.gz$', re.I)
TEX_EXT_PATT = re.compile(r'^\.tex$', re.I)
NON_TEXT_PATT = re.compile(r'^\.(pdf|eps|jpg|png|gif)$', re.I)
BBL_SIGN = '\\bibitem'
# natbib fix
PRE_FIX_NATBIB = True
NATBIB_PATT = re.compile(
(r'\\cite(t|p|alt|alp|author|year|yearpar)\s*?\*?\s*?'
r'(\[[^\]]*?\]\s*?)*?\s*?\*?\s*?\{([^\}]+?)\}'),
re.I
)
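# e.g. NATBIB_PATT.sub(r'\\cite{\3}', ...) as used below turns
# '\citep[see][chap. 2]{key1,key2}' into '\cite{key1,key2}'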
# bibitem option fix
PRE_FIX_BIBOPT = True
BIBOPT_PATT = re.compile(r'\\bibitem\s*?\[[^]]*?\]', re.I | re.M)
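# e.g. BIBOPT_PATT.sub(r'\\bibitem', ...) as used below turns
# '\bibitem[Smith et al.(2020)]{smith20}' into '\bibitem{smith20}'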
# ↑ the two fixes above solve most tralics problems, except for mnras style
#   bibitems (https://ctan.org/pkg/mnras)
# aggressive math pre-removal
PRE_FILTER_MATH = False
FILTER_PATTS = []
for env in ['equation', 'displaymath', 'array', 'eqnarray', 'align', 'gather',
'multline', 'flalign', 'alignat']:
s = r'\\begin\{{{0}[*]?\}}.+?\\end\{{{0}\}}'.format(env)
patt = re.compile(s, re.I | re.M | re.S)
FILTER_PATTS.append(patt)
FILTER_PATTS.append(re.compile(r'\$\$.+?\$\$', re.S))
FILTER_PATTS.append(re.compile(r'\$.+?\$', re.S))
FILTER_PATTS.append(re.compile(r'\\\(.+?\\\)', re.S))
FILTER_PATTS.append(re.compile(r'\\\[.+?\\\]', re.S))
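# note: pattern order matters; '$$...$$' is stripped before '$...$', since
# applying the single dollar pattern first would cut into display math
# delimiters and leave stray '$'s behind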
def read_file(path):
    try:
        with open(path) as f:
            cntnt = f.read()
    except UnicodeDecodeError:
        # fall back to guessing the encoding, first via libmagic, then via
        # chardet
        with open(path, 'rb') as f:
            blob = f.read()
        m = magic.Magic(mime_encoding=True)
        encoding = m.from_buffer(blob)
        try:
            cntnt = blob.decode(encoding)
        except (UnicodeDecodeError, LookupError):
            encoding = chardet.detect(blob)['encoding']
            if encoding:
                try:
                    cntnt = blob.decode(encoding, errors='replace')
                except Exception:
                    return ''
            else:
                return ''
    return cntnt
def read_gzipped_file(path):
    with gzip.open(path, 'rb') as f:
        blob = f.read()
    m = magic.Magic(mime_encoding=True)
    encoding = m.from_buffer(blob)
    try:
        cntnt = blob.decode(encoding)
    except (UnicodeDecodeError, LookupError):
        encoding = chardet.detect(blob)['encoding']
        if not encoding:
            return False
        cntnt = blob.decode(encoding, errors='replace')
    return cntnt
def remove_math(latex_str):
    # parts = [<preamble>, '\\begin{document}', <body>]; only the body gets
    # math stripped so the preamble stays untouched
    parts = re.split(MAIN_TEX_PATT, latex_str, maxsplit=1)
    if len(parts) < 3:  # no \begin{document} marker found
        return latex_str
    for patt in FILTER_PATTS:
        parts[2] = re.sub(patt, '', parts[2])
    return ''.join(parts)
def _source_file_hash(fp):
source_file_hasher = sha1()
with open(fp, 'rb') as source_file:
buf = source_file.read()
source_file_hasher.update(buf)
source_file_hash = str(source_file_hasher.hexdigest())
return source_file_hash
def normalize(in_dir, out_dir, write_logs=True):
def log(msg):
if write_logs:
with open(os.path.join(out_dir, 'log.txt'), 'a') as f:
f.write('{}\n'.format(msg))
if not os.path.isdir(in_dir):
print('dump directory does not exist')
return False
if not os.path.isdir(out_dir):
os.makedirs(out_dir)
source_file_info = dict()
for fn in os.listdir(in_dir):
path = os.path.join(in_dir, fn)
aid_fn_safe, ext = os.path.splitext(fn)
source_file_info[aid_fn_safe] = {
'name': fn,
'hash': _source_file_hash(path)
}
if PDF_EXT_PATT.match(ext):
# copy over pdf file as is
dest = os.path.join(out_dir, fn)
shutil.copyfile(path, dest)
elif GZ_EXT_PATT.match(ext):
if tarfile.is_tarfile(path):
with tempfile.TemporaryDirectory() as tmp_dir_path:
# extract archive contents
                    with tarfile.open(path) as tar:
                        fnames = tar.getnames()
                        tar.extractall(path=tmp_dir_path)
# identify main tex file
main_tex_path = None
ignored_names = []
# check .tex files first
for tfn in fnames:
if not TEX_EXT_PATT.match(os.path.splitext(tfn)[1]):
ignored_names.append(tfn)
continue
tmp_file_path = os.path.join(tmp_dir_path, tfn)
if os.path.isdir(tmp_file_path):
continue
try:
cntnt = read_file(tmp_file_path)
                        except Exception:
continue
if re.search(MAIN_TEX_PATT, cntnt) is not None:
main_tex_path = tmp_file_path
# try other files
if main_tex_path is None:
for tfn in ignored_names:
tmp_file_path = os.path.join(tmp_dir_path, tfn)
if NON_TEXT_PATT.match(os.path.splitext(tfn)[1]):
continue
try:
cntnt = read_file(tmp_file_path)
if re.search(MAIN_TEX_PATT, cntnt) is not None:
main_tex_path = tmp_file_path
                            except Exception:
continue
# give up
if main_tex_path is None:
log(('couldn\'t find main tex file in dump archive {}'
'').format(fn))
continue
# "identify" bbl file
# https://arxiv.org/help/submit_tex#bibtex
                    main_tex_fn = os.path.normpath(
                        main_tex_path).split(os.sep)[-1]
                    # derive the .bbl name from the main file's basename so
                    # that both paths passed to latexpand are relative to
                    # its working directory
                    fn_base = os.path.splitext(main_tex_fn)[0]
                    bbl_fn = '{}.bbl'.format(fn_base)
if os.path.isfile(os.path.join(tmp_dir_path, bbl_fn)):
latexpand_args = ['latexpand',
'--expand-bbl',
bbl_fn,
main_tex_fn]
else:
latexpand_args = ['latexpand',
main_tex_fn]
# flatten to single tex file and save
new_tex_fn = '{}.tex'.format(aid_fn_safe)
tmp_dest = os.path.join(tmp_dir_path, new_tex_fn)
out = open(tmp_dest, mode='w')
if write_logs:
err = open(
os.path.join(out_dir, 'log_latexpand.txt'), 'a'
)
else:
err = open(os.devnull, 'w')
                    err.write('\n------------- {} -------------\n'.format(
                        aid_fn_safe))
err.flush()
subprocess.run(latexpand_args, stdout=out, stderr=err,
cwd=tmp_dir_path)
out.close()
err.close()
                    # re-read and re-write the file to guarantee UTF-8,
                    # because latexpand's output encoding is not reliable
cntnt = read_file(tmp_dest)
if PRE_FIX_NATBIB:
cntnt = NATBIB_PATT.sub(r'\\cite{\3}', cntnt)
if PRE_FIX_BIBOPT:
cntnt = BIBOPT_PATT.sub(r'\\bibitem', cntnt)
if PRE_FILTER_MATH:
cntnt = remove_math(cntnt)
dest = os.path.join(out_dir, new_tex_fn)
with open(dest, mode='w', encoding='utf-8') as f:
f.write(cntnt)
else:
                # extract gzipped tex file
cntnt = read_gzipped_file(path)
if not cntnt:
continue
if re.search(MAIN_TEX_PATT, cntnt) is None:
log('unexpected content in dump archive {}'.format(fn))
continue
new_fn = '{}.tex'.format(aid_fn_safe)
if PRE_FIX_NATBIB:
cntnt = NATBIB_PATT.sub(r'\\cite{\3}', cntnt)
if PRE_FIX_BIBOPT:
                    # raw string needed: in a replacement template, '\b'
                    # would be interpreted as a backspace escape
                    cntnt = BIBOPT_PATT.sub(r'\\bibitem', cntnt)
if PRE_FILTER_MATH:
cntnt = remove_math(cntnt)
dest = os.path.join(out_dir, new_fn)
with open(dest, mode='w', encoding='utf-8') as f:
f.write(cntnt)
else:
log('unexpected file {} in dump directory'.format(fn))
return source_file_info
if __name__ == '__main__':
if len(sys.argv) != 3:
        print(('usage: python3 normalize_arxiv_dump.py </path/to/dump/dir> '
               '</path/to/out/dir>'))
        sys.exit(1)
in_dir = sys.argv[1]
out_dir = sys.argv[2]
ret = normalize(in_dir, out_dir)
if not ret:
        sys.exit(1)