Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

revisit of redirect optimization: #753

Merged
merged 3 commits into from
Aug 20, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions pywb/warcserver/resource/resolvingloader.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,14 @@ def load_headers_and_payload(self, cdx, failed_files, cdx_loader):
# two index lookups
# Case 1: if mimetype is still warc/revisit
if cdx.get('mime') == 'warc/revisit' and headers_record:
if headers_record.http_headers:
status = headers_record.http_headers.get_statuscode()
# optimization: if the revisit is a redirect (3xx status), don't load the payload record, as the browser will ignore the body
# for such redirects, always replay a zero-length payload (Content-Length: 0)
if status and status.startswith('3'):
headers_record.http_headers.replace_header('Content-Length', '0')
return headers_record, headers_record

payload_record = self._load_different_url_payload(cdx,
headers_record,
failed_files,
Expand Down
2 changes: 0 additions & 2 deletions tests/test_integration.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,13 +104,11 @@ def test_replay_content_head(self, fmod):
def test_replay_content_head_non_zero_content_length_match(self):
resp = self.testapp.get('/pywb/20140126200625id_/http://www.iana.org/_js/2013.1/jquery.js', status=200)
length = resp.content_length
print('length', length)

# Content-Length included if non-zero
resp = self.testapp.head('/pywb/20140126200625id_/http://www.iana.org/_js/2013.1/jquery.js', status=200)

#assert resp.headers['Content-Length'] == length
print('length', resp.content_length)
assert resp.content_length == length

def test_replay_content(self, fmod):
Expand Down
13 changes: 8 additions & 5 deletions tests/test_redirect_revisits.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,11 @@
from .base_config_test import BaseConfigTest, CollsDirMixin, fmod

from io import BytesIO
import os

from warcio import WARCWriter, StatusAndHeaders
from pywb.manager.manager import main as wb_manager

from .base_config_test import BaseConfigTest, CollsDirMixin, fmod


# ============================================================================
class TestRevisits(CollsDirMixin, BaseConfigTest):
Expand Down Expand Up @@ -125,18 +124,22 @@ def test_different_url_revisit_orig_headers(self, fmod):
res = self.get('/revisits/20220101{0}/http://example.com/', fmod, status=301)
assert res.headers["Custom"] == "4"
assert res.headers["Location"].endswith("/20220101{0}/https://example.com/redirect-4".format(fmod))
assert res.text == 'some\ntext'
assert res.content_length == 0
assert res.text == ''

def test_different_url_revisit_and_response(self, fmod):
def test_different_url_response_and_revisit(self, fmod):
# response
res = self.get('/revisits/20200101{0}/http://example.com/orig-2', fmod, status=301)
assert res.headers["Custom"] == "2"
assert res.headers["Location"].endswith("/20200101{0}/https://example.com/redirect-2".format(fmod))
assert res.text == 'some\ntext'

# revisit
res = self.get('/revisits/20220101{0}/http://example.com/orig-2', fmod, status=301)
assert res.headers["Custom"] == "3"
assert res.headers["Location"].endswith("/20220101{0}/https://example.com/redirect-3".format(fmod))
assert res.text == 'some\ntext'
assert res.content_length == 0
assert res.text == ''

def test_orig(self, fmod):
res = self.get('/revisits/20200101{0}/http://example.com/orig-1', fmod, status=301)
Expand Down