-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathrefine-hyperlinks.py
executable file
·146 lines (141 loc) · 5.19 KB
/
refine-hyperlinks.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
#!/c/Users/vadim/AppData/Local/Programs/Python/Python35/python
# -*- coding: utf-8 -*-
#
# a module for cleaning up links
import sys, os, os.path
from fancy.ANSI import C
from lib.AST import Sleigh
from lib.JSON import parseJSON, json2lines
from lib.LP import listify
from lib.NLP import strictstrip
ienputdir = '../json'
n2f_name = '_name2file.json'
name2file = parseJSON(n2f_name) if os.path.exists(n2f_name) else {}
sleigh = Sleigh(ienputdir + '/corpus', name2file)
verbose = False
def checkon(fn, o):
if not os.path.exists(fn) or os.path.isdir(fn):
fn = fn + '.json'
f = open(fn, 'r', encoding='utf-8')
lines = f.readlines()[1:-1]
f.close()
flines = json2lines(lines)
plines = sorted(json2lines(o.getJSON().split('\n')))
# "url" from DBLP are useless
if 'url' in o.json.keys():
o.json['url'] = [link.replace('https://', 'http://')\
for link in listify(o.json['url'])\
if not link.startswith('db/conf/')\
and not link.startswith('db/series/')\
and not link.startswith('db/books/')\
and not link.startswith('db/journals/')]
if not o.json['url']:
del o.json['url']
elif len(o.json['url']) == 1:
o.json['url'] = o.json['url'][0]
if 'ee' in o.json.keys() and 'doi' not in o.json.keys():
if isinstance(o.json['ee'], list):
if verbose:
print(C.red('Manylink:'), o.json['ee'])
newee = []
for onelink in listify(o.json['ee']):
if onelink.startswith('http://dx.doi.org/'):
o.json['doi'] = onelink[18:]
elif onelink.startswith('http://doi.acm.org/'):
o.json['doi'] = onelink[19:]
elif onelink.startswith('http://doi.ieeecomputersociety.org/'):
o.json['doi'] = onelink[35:]
elif onelink.startswith('http://dl.acm.org/citation.cfm?id='):
o.json['acmid'] = onelink[34:]
elif onelink.startswith('http://portal.acm.org/citation.cfm?id='):
o.json['acmid'] = onelink[38:]
elif onelink.startswith('http://ieeexplore.ieee.org/xpl/freeabs_all.jsp?arnumber=')\
or onelink.startswith('http://ieeexplore.ieee.org/xpls/abs_all.jsp?arnumber='):
o.json['ieeearid'] = onelink.split('=')[-1]
elif onelink.startswith('http://ieeexplore.ieee.org/xpls/abs_all.jsp?isnumber=')\
and onelink.find('arnumber') > -1:
o.json['ieeearid'] = onelink.split('arnumber=')[-1].split('&')[0]
elif onelink.startswith('http://ieeexplore.ieee.org/xpl/mostRecentIssue.jsp?punumber='):
o.json['ieeepuid'] = onelink.split('=')[-1]
elif onelink.startswith('http://ieeexplore.ieee.org/xpl/tocresult.jsp?isnumber='):
o.json['ieeeisid'] = onelink.split('=')[-1]
elif onelink.startswith('http://eceasst.cs.tu-berlin.de/index.php/eceasst/article/view/'):
newee.append('http://journal.ub.tu-berlin.de/eceasst/article/view/' + onelink.split('/')[-1])
elif onelink.endswith('.pdf') and \
(onelink.startswith('http://computer.org/proceedings/')\
or onelink.startswith('http://csdl.computer.org/')):
# Bad: http://computer.org/proceedings/icsm/1189/11890007.pdf
# Bad: http://csdl.computer.org/comp/proceedings/date/2003/1870/02/187020040.pdf
# Good: http://www.computer.org/csdl/proceedings/icsm/2001/1189/00/11890004.pdf
if onelink.startswith('http://csdl'):
cname, _, cid, mid, pid = onelink.split('/')[5:10]
else:
cname, cid, pid = onelink.split('/')[4:7]
# heuristic
if pid.startswith(cid):
mid = pid[len(cid):len(cid)+2]
else:
mid = '00'
newee.append('http://www.computer.org/csdl/proceedings/{}/{}/{}/{}/{}'.format(\
cname,
o.get('year'),
cid,
mid,
pid))
else:
if onelink.find('ieee') > -1:
print(C.purple('IEEE'), onelink)
if verbose:
print(C.yellow('Missed opportunity:'), onelink)
# nothing matches => preserve
newee.append(onelink)
if len(newee) == 0:
del o.json['ee']
elif len(newee) == 1:
o.json['ee'] = newee[0]
else:
o.json['ee'] = newee
# post-processing normalisation
if 'acmid' in o.json.keys() and not isinstance(o.json['acmid'], int) and o.json['acmid'].isdigit():
o.json['acmid'] = int(o.json['acmid'])
if 'eventuri' in o.json.keys():
o.json['eventurl'] = o.json['eventuri']
del o.json['eventuri']
if 'eventurl' in o.json.keys() and o.json['eventurl'].startswith('https://'):
o.json['eventurl'] = o.json['eventurl'].replace('https://', 'http://')
nlines = sorted(json2lines(o.getJSON().split('\n')))
if flines != plines:
return 1
elif plines != nlines:
f = open(fn, 'w', encoding='utf-8')
f.write(o.getJSON())
f.close()
return 2
else:
return 0
def checkreport(fn, o):
statuses = (C.blue('PASS'), C.red('FAIL'), C.yellow('FIXD'))
r = checkon(fn, o)
# non-verbose mode by default
if verbose or r != 0:
print('[ {} ] {}'.format(statuses[r], fn))
return r
if __name__ == "__main__":
if len(sys.argv) > 1:
verbose = sys.argv[1] == '-v'
print('{}: {} venues, {} papers\n{}'.format(\
C.purple('BibSLEIGH'),
C.red(len(sleigh.venues)),
C.red(sleigh.numOfPapers()),
C.purple('='*42)))
cx = {0: 0, 1: 0, 2: 0}
for v in sleigh.venues:
for c in v.getConfs():
cx[checkreport(c.filename, c)] += 1
for p in c.papers:
cx[checkreport(p.filename, p)] += 1
print('{} files checked, {} ok, {} fixed, {} failed'.format(\
C.bold(cx[0] + cx[1] + cx[2]),
C.blue(cx[0]),
C.yellow(cx[2]),
C.red(cx[1])))