Skip to content

Commit

Permalink
revisit lookup fix (possible fix for ukwa/ukwa-pywb#53) (#530)
Browse files Browse the repository at this point in the history
- if a revisit record has empty hash, don't attempt to lookup an original, simply use with empty payload
  • Loading branch information
ikreymer authored Jan 11, 2020
1 parent f0b9d5b commit fb8aa7c
Show file tree
Hide file tree
Showing 2 changed files with 33 additions and 15 deletions.
11 changes: 9 additions & 2 deletions pywb/warcserver/resource/resolvingloader.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@
class ResolvingLoader(object):
MISSING_REVISIT_MSG = 'Original for revisit record could not be loaded'

EMPTY_DIGEST = '3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ'

def __init__(self, path_resolvers, record_loader=None, no_record_parse=False):
self.path_resolvers = path_resolvers
self.record_loader = record_loader if record_loader is not None else BlockArcWarcRecordLoader()
Expand Down Expand Up @@ -163,6 +165,13 @@ def _load_different_url_payload(self, cdx, headers_record,
Raise exception if no matches found.
"""

digest = cdx.get('digest', '-')

# if the digest is the empty record digest, don't attempt to look up the payload record!
# the payload is simply empty, so use empty payload of existing record
if digest == self.EMPTY_DIGEST:
return headers_record

ref_target_uri = (headers_record.rec_headers.
get_header('WARC-Refers-To-Target-URI'))

Expand All @@ -180,8 +189,6 @@ def _load_different_url_payload(self, cdx, headers_record,
else:
ref_target_date = iso_date_to_timestamp(ref_target_date)

digest = cdx.get('digest', '-')

try:
orig_cdx_lines = self.load_cdx_for_dupe(ref_target_uri,
ref_target_date,
Expand Down
37 changes: 24 additions & 13 deletions tests/test_redirects.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ def create_response_record(self, url, timestamp, text):
self.writer.write_record(rec)
return rec

def create_revisit_record(self, original, url, redirect_url, timestamp):
def create_revisit_record(self, url, timestamp, redirect_url, original_dt):
warc_headers = {}
warc_headers['WARC-Date'] = timestamp_to_iso_date(timestamp)

Expand All @@ -67,9 +67,9 @@ def create_revisit_record(self, original, url, redirect_url, timestamp):
http_headers = StatusAndHeaders('302 Temp Redirect', headers_list, protocol='HTTP/1.0')

rec = self.writer.create_revisit_record(url,
digest=original.rec_headers['WARC-Payload-Digest'],
digest='3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ',
refers_to_uri=url,
refers_to_date=original.rec_headers['WARC-Date'],
refers_to_date=original_dt,
warc_headers_dict=warc_headers,
http_headers=http_headers)

Expand All @@ -80,9 +80,12 @@ def test_init_1(self):
with open(filename, 'wb') as fh:
self.writer = WARCWriter(fh, gzip=True)

redirect = self.create_redirect_record('http://example.com/', 'https://example.com/', '201806026101112')
redirect = self.create_redirect_record('https://example.com/', 'https://www.example.com/', '201806026101112')
response = self.create_response_record('https://www.example.com/', '201806026101112', 'Some Text')
redirect = self.create_redirect_record('http://example.com/', 'https://example.com/', '20180626101112')
redirect = self.create_redirect_record('https://example.com/', 'https://www.example.com/', '20180626101112')
response = self.create_response_record('https://www.example.com/', '20180626101112', 'Some Text')

revisit = self.create_revisit_record('https://example.com/path', '20190626101112', 'https://example.com/abc', response.rec_headers['WARC-Date'])
revisit = self.create_revisit_record('https://www.example.com/', '20190626101112', 'https://www.example.com/', response.rec_headers['WARC-Date'])

wb_manager(['init', 'redir'])

Expand All @@ -91,7 +94,7 @@ def test_init_1(self):
assert os.path.isfile(os.path.join(self.root_dir, self.COLLS_DIR, 'redir', 'indexes', 'index.cdxj'))

def test_self_redir_1(self, fmod):
res = self.get('/redir/201806026101112{0}/https://example.com/', fmod)
res = self.get('/redir/20180626101112{0}/https://example.com/', fmod, status=200)

assert res.status_code == 200

Expand All @@ -102,31 +105,39 @@ def test_redir_init_slash(self):
with open(filename, 'wb') as fh:
self.writer = WARCWriter(fh, gzip=True)

response = self.create_response_record('https://www.example.com/sub/path/', '201806026101112', 'Sub Path Data')
response = self.create_response_record('https://www.example.com/sub/path/', '20180626101112', 'Sub Path Data')

response = self.create_response_record('https://www.example.com/sub/path/?foo=bar', '201806026101112', 'Sub Path Data Q')
response = self.create_response_record('https://www.example.com/sub/path/?foo=bar', '20180626101112', 'Sub Path Data Q')

wb_manager(['add', 'redir', filename])

def test_redir_slash(self, fmod):
res = self.get('/redir/201806026101112{0}/https://example.com/sub/path', fmod, status=307)
res = self.get('/redir/20180626101112{0}/https://example.com/sub/path', fmod, status=307)

assert res.headers['Location'].endswith('/redir/201806026101112{0}/https://example.com/sub/path/'.format(fmod))
assert res.headers['Location'].endswith('/redir/20180626101112{0}/https://example.com/sub/path/'.format(fmod))
res = res.follow()

assert res.status_code == 200

assert res.text == 'Sub Path Data'

def test_redir_slash_with_query(self, fmod):
res = self.get('/redir/201806026101112{0}/https://example.com/sub/path?foo=bar', fmod, status=307)
res = self.get('/redir/20180626101112{0}/https://example.com/sub/path?foo=bar', fmod, status=307)

assert res.headers['Location'].endswith('/redir/201806026101112{0}/https://example.com/sub/path/?foo=bar'.format(fmod))
assert res.headers['Location'].endswith('/redir/20180626101112{0}/https://example.com/sub/path/?foo=bar'.format(fmod))
res = res.follow()

assert res.status_code == 200

assert res.text == 'Sub Path Data Q'

def test_revisit_redirect_302(self, fmod):
res = self.get('/redir/20170626101112{0}/https://example.com/path', fmod, status=302)
assert res.headers['Location'].endswith('/redir/20170626101112{0}/https://example.com/abc'.format(fmod))
assert res.text == ''

def test_revisit_redirect_skip_self_redir(self, fmod):
res = self.get('/redir/20190626101112{0}/http://www.example.com/', fmod, status=200)
assert res.text == 'Some Text'


0 comments on commit fb8aa7c

Please sign in to comment.