#!/usr/bin/env python

"""Import data to Open Annotation store over RESTful interface.

Example usage:

    python import.py data/examples/craft/12925238.jsonld

(requires a RESTful Open Annotation server)
"""

__author__ = 'Sampo Pyysalo'
__license__ = 'MIT'

import os
import sys
import logging
import urlparse
import codecs
import json

from os import path
from logging import warn, info

import requests

# logging.basicConfig(level=logging.INFO)

TARGET_KEY = 'target'

DEFAULT_ANN_URL = 'http://127.0.0.1:5005/annotations/'
DEFAULT_DOC_URL = 'http://127.0.0.1:5005/documents/'
DEFAULT_ENCODING = 'utf-8'


def argparser():
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('source', metavar='FILE/DIR', nargs='+',
                        help='Source data to import')
    parser.add_argument('-v', '--verbose', default=False, action='store_true',
                        help='Verbose output')
    parser.add_argument('-u', '--url', default=DEFAULT_ANN_URL,
                        help='URL for annotation store (default %s)' %
                        DEFAULT_ANN_URL)
    parser.add_argument('-d', '--docurl', default=DEFAULT_DOC_URL,
                        help='URL for document store (default %s)' %
                        DEFAULT_DOC_URL)
    parser.add_argument('-q', '--quiet', default=False, action='store_true',
                        help='No output')
    return parser

def pretty(doc):
    """Pretty-print JSON."""
    return json.dumps(doc, sort_keys=True, indent=2, separators=(',', ': '))


def read_json_file(source):
    """Read file and return parsed JSON content."""
    with codecs.open(source, encoding=DEFAULT_ENCODING) as f:
        text = f.read()
    return json.loads(text)


def read_text_file(filename, directory=None):
    """Read text file and return its content, or None on failure."""
    if directory is not None:
        filename = path.join(directory, filename)
    try:
        with codecs.open(filename, encoding=DEFAULT_ENCODING) as f:
            return f.read()
    except Exception as e:
        warn('failed to read %s: %s' % (filename, str(e)))
        return None

def pretty_response_text(response):
    """Return pretty-printed response body if it is JSON, raw text otherwise."""
    try:
        return pretty(response.json())
    except ValueError:
        return response.text


def process_response(document, response, options):
    """Report on response and return True on success, False on error."""
    try:
        response.raise_for_status()
        if options is not None and options.verbose:
            print response.status_code
            print pretty_response_text(response)
        return True
    except requests.exceptions.HTTPError as error:
        if options is None or not options.quiet:
            print error.message
            print pretty(document)
            print pretty_response_text(response)
        return False

def is_relative(url):
    """Return True if the given URL is relative, False otherwise."""
    return urlparse.urlparse(url).netloc == ''


def get_relative_target_urls(document):
    """Return unique relative target URLs in OA JSON-LD document."""
    # TODO: check for @base to differentiate true relative targets
    # from ones that just look relative without context.
    found = set()
    target = document.get(TARGET_KEY)
    if not target:
        warn('missing target')
    elif isinstance(target, basestring):
        if is_relative(target):
            found.add(urlparse.urldefrag(target)[0])
    elif isinstance(target, list):
        for t in target:
            if is_relative(t):
                found.add(urlparse.urldefrag(t)[0])
    else:
        raise NotImplementedError('structured target support')
    return found
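
# For reference, urlparse treats a URL as relative when it has no network
# location: e.g. urlparse.urlparse('12925238.txt#char=0,5').netloc == '',
# so is_relative() accepts it, while 'http://example.org/doc.txt' has a
# netloc and is considered absolute (example URLs are illustrative).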

def _map_relative(target, target_map):
    # Helper for rewrite_relative_target_urls
    assert isinstance(target, basestring)
    if not is_relative(target):
        return target
    base, frag = urlparse.urldefrag(target)
    mapped = target_map.get(base)
    if not mapped:
        return target
    else:
        return mapped + '#' + frag
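
# Illustrative example: with target_map
# {'doc.txt': 'http://127.0.0.1:5005/documents/doc.txt'},
# _map_relative('doc.txt#char=0,5', target_map) returns
# 'http://127.0.0.1:5005/documents/doc.txt#char=0,5', while absolute or
# unmapped targets are returned unchanged.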

def rewrite_relative_target_urls(document, target_map):
    """Replace relative target URLs with absolute equivalents."""
    target = document.get(TARGET_KEY)
    if not target:
        return
    elif isinstance(target, basestring):
        mapped = _map_relative(target, target_map)
    elif isinstance(target, list):
        mapped = [_map_relative(t, target_map) for t in target]
    else:
        raise NotImplementedError('structured target support')
    document[TARGET_KEY] = mapped


def remove_non_files(filenames, basedir):
    """Return given list without non-files."""
    filtered = set()
    for filename in filenames:
        pathname = path.join(basedir, filename)
        if not os.path.exists(pathname):
            warn('target not found: %s' % pathname)
        elif not os.path.isfile(pathname):
            warn('target not file: %s' % pathname)
        else:
            filtered.add(filename)
    return filtered

def remove_known_targets(targets, target_text, target_map, options):
    """Return targets not found in the document store, updating target_map
    for those already present.
    """
    # TODO: use HEAD and ETag to avoid actual download
    filtered = set()
    headers = {'Accept': 'text/plain'}
    for target in targets:
        url = urlparse.urljoin(options.docurl, target)
        response = requests.get(url, headers=headers)
        if response.status_code == 404:
            info('not found: %s' % url)
            filtered.add(target)
        elif response.status_code == 200:
            if response.text == target_text[target]:
                # Document exists with identical text; OK.
                info('document already in store: %s' % target)
                target_map[target] = url
            else:
                # Document exists with different text; warn and block mapping.
                warn('text mismatch for %s vs %s' % (target, url))
                target_map[target] = None
        else:
            response.raise_for_status()
    return filtered

def post_target(target, target_text, target_map, store):
    """Post given target document to store."""
    headers = {'Content-type': 'application/json'}
    content = target_text[target]
    if content is None:
        target_map[target] = None
        return False
    doc = {
        'name': target,
        'text': content,
    }
    response = requests.post(store, data=json.dumps(doc), headers=headers)
    try:
        response.raise_for_status()
        target_map[target] = urlparse.urljoin(store, target)
        print 'POSTed %s to store %s' % (target, store)
        return True
    except Exception as e:
        warn('error posting %s: %s' % (target, str(e)))
        target_map[target] = None
        return False
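
# This assumes the document store accepts JSON of the form
# {'name': ..., 'text': ...} and then serves the document at store/name,
# so that e.g. (illustrative) POSTing {'name': '12925238.txt', 'text': ...}
# to http://127.0.0.1:5005/documents/ lets annotation targets be mapped
# to http://127.0.0.1:5005/documents/12925238.txt.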

def post_target_documents(targets, target_map, basedir, options):
    """Post new targets to document store."""
    # Filter out known and unavailable targets
    targets = set(t for t in targets if t not in target_map)
    targets = remove_non_files(targets, basedir)
    if not targets:
        return
    # Make sure we have a store to post to
    if not options.docurl:
        warn('no docurl given, cannot POST target(s): %s' % ' '.join(targets))
        return
    # Read target document texts
    target_text = {}
    for target in targets:
        target_text[target] = read_text_file(target, basedir)
    # Exclude documents already in the store
    targets = remove_known_targets(targets, target_text, target_map, options)
    if not targets:
        return
    # Post each target to the store
    for target in targets:
        post_target(target, target_text, target_map, options.docurl)

def resolve_target_references(document, basedir, options):
    """Resolve relative target URLs, uploading documents if required."""
    target_map = resolve_target_references.target_map
    relative = get_relative_target_urls(document)
    # POST any new targets to document store, updating map
    post_target_documents(relative, target_map, basedir, options)
    # rewrite relative URLs using the mapping
    rewrite_relative_target_urls(document, target_map)

# Target-to-URL mapping persisted across calls as a function attribute.
resolve_target_references.target_map = {}

def prepare_document_for_POST(document):
    """Make any modifications that may be necessary to POST document to a
    RESTful Open Annotation store.
    """
    # TODO: reconsider whether to allow @id in POSTed documents.
    if '@id' in document:
        del document['@id']
    return document


def select_files(directory):
    """Yield paths of files or directories that can be imported."""
    assert path.isdir(directory)
    for filename in os.listdir(directory):
        pathname = path.join(directory, filename)
        if path.isdir(pathname) or pathname.endswith('.jsonld'):
            yield pathname

def import_from_dir(directory, options):
    """Import data from directory.

    Return tuple of (success, failure) counts.
    """
    success, failure = 0, 0
    for name in select_files(directory):
        subcount = import_from(name, options)
        success += subcount[0]
        failure += subcount[1]
    return (success, failure)


def import_from_file(source, options):
    """Import data from file.

    Return tuple of (success, failure) counts.
    """
    count = {
        True: 0,
        False: 0,
    }
    headers = {'Content-type': 'application/json'}
    try:
        data = read_json_file(source)
    except Exception as e:
        print 'Failed to load json from %s: %s' % (source, str(e))
        return (0, 1)
    basedir = os.path.dirname(source)
    for doc in data['@graph']:
        resolve_target_references(doc, basedir, options)
        doc = prepare_document_for_POST(doc)
        rep = requests.post(options.url, data=json.dumps(doc), headers=headers)
        status = process_response(doc, rep, options)
        count[status] += 1
    return (count[True], count[False])


def import_from(source, options):
    """Import data from file or directory source.

    Return tuple of (success, failure) counts.
    """
    if path.isdir(source):
        return import_from_dir(source, options)
    else:
        return import_from_file(source, options)

def fix_args(args):
    """Fix potentially problematic user-provided arguments."""
    # Note: urlparse gives unexpected results when given an
    # incomplete url with a port and a path but no scheme:
    #     >>> urlparse.urlparse('example.org:80/foo').scheme
    #     'example.org'
    # We're avoiding this issue by prepending a default scheme
    # if there's no obvious one present.
    def has_scheme(u):
        return u.startswith('http://') or u.startswith('https://')
    # We're going to be urljoin:ing things to the docurl collection,
    # so docurl has to end in a slash.
    if args.docurl and not args.docurl.endswith('/'):
        warn('adding "/" to docurl %s' % args.docurl)
        args.docurl += '/'
    if not has_scheme(args.url):
        warn('adding "http://" to url %s' % args.url)
        args.url = 'http://' + args.url
    if not has_scheme(args.docurl):
        warn('adding "http://" to docurl %s' % args.docurl)
        args.docurl = 'http://' + args.docurl
    return args
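
# For example (illustrative values), fix_args rewrites url
# 'example.org:80/annotations' to 'http://example.org:80/annotations' and
# docurl 'example.org/documents' to 'http://example.org/documents/'.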

def main(argv):
    args = fix_args(argparser().parse_args(argv[1:]))
    if args.verbose and args.quiet:
        argparser().print_help()
        print 'error: both --verbose and --quiet specified.'
        return 1
    for s in args.source:
        success, failure = import_from(s, args)
        if not args.quiet:
            print '%s: %d succeeded, %d failed' % (s, success, failure)
    return 0


if __name__ == '__main__':
    sys.exit(main(sys.argv))