Skip to content

Commit 9aa2ddb

Browse files
author
Mantas Briliauskas
committed
Add save_html_on_codes setting
1 parent 76b6b8d commit 9aa2ddb

File tree

3 files changed

+88
-15
lines changed

3 files changed

+88
-15
lines changed

README.rst

+5-1
Original file line numberDiff line numberDiff line change
@@ -52,9 +52,13 @@ HTML storage downloader middleware supports such options:
5252

5353
* **gzip_output** (bool) - if True, HTML output will be stored in gzip format.
5454
Default is False.
55+
* **save_html_on_codes** (list) - if not empty, sets the list of response codes
56+
whitelisted for html saving. If the list is empty or not provided, all response
57+
codes will be allowed for html saving.
5558

5659
Sample::
5760

5861
HTML_STORAGE = {
59-
'gzip_output': True
62+
'gzip_output': True,
63+
'save_html_on_codes': [200, 202]
6064
}

scrapy_html_storage/__init__.py

+24-5
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ def __init__(self, settings):
1515
"""
1616
self.settings = settings.get('HTML_STORAGE', {})
1717
self.gzip_output = self.settings.get('gzip_output', False)
18+
self.save_html_on_codes = self.settings.get('save_html_on_codes', [])
1819

1920

2021
@classmethod
@@ -43,7 +44,7 @@ def process_response(self, request, response, spider):
4344
Returns:
4445
scrapy.http.response.Response: unmodified response object.
4546
"""
46-
if should_save_html(request):
47+
if self._should_save_html(request, response):
4748
self._save_html_to(spider.response_html_path(request), response.body)
4849

4950
return response
@@ -64,12 +65,30 @@ def _save_html_to(self, path, html_body):
6465
fs.write_to_file(path, html_body)
6566

6667

67-
def should_save_html(request):
68+
def _should_save_html(self, request, response):
    """Decide whether the response body should be written to disk.

    Args:
        request (scrapy.http.request.Request): request whose ``meta`` is
            checked for the ``save_html`` key.
        response (scrapy.http.response.Response): response whose ``status``
            is checked against ``self.save_html_on_codes``.

    Returns:
        bool: True if ``save_html`` is present in ``request.meta`` and the
            response status passes the configured status code filter,
            False otherwise.
    """
    # Only requests that opted in are candidates for saving.
    if 'save_html' not in request.meta:
        return False

    return should_save_html_according_response_code(
        response.status, self.save_html_on_codes)
82+
83+
84+
def should_save_html_according_response_code(code, allowed_list):
    """Check a response status code against the list of allowed codes.

    Args:
        code (int): response status code.
        allowed_list (list): response status codes for which html may be
            saved. An empty (falsy) value disables filtering.

    Returns:
        bool: True if allowed_list is empty (every response is saved) or
            code is in allowed_list, False otherwise.
    """
    if not allowed_list:
        # No whitelist configured - every status code is accepted.
        return True

    return code in allowed_list

tests/test_html_storage_middleware.py

+59-9
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,10 @@
1-
from hamcrest import assert_that, is_
1+
from hamcrest import assert_that, is_, has_properties
22
from mock import MagicMock, patch, ANY
33
import pytest
44

55
from scrapy.settings import Settings
66

7-
from scrapy_html_storage import HtmlStorageMiddleware, should_save_html
7+
from scrapy_html_storage import HtmlStorageMiddleware
88

99

1010
def make_request_mock(save_html=False, query='', results_page=None):
@@ -19,11 +19,51 @@ def make_request_mock(save_html=False, query='', results_page=None):
1919

2020
return request_mock
2121

22+
def make_response_mock(response_status):
    """Build an HTTP response mock that reports the given status code.

    Args:
        response_status (int): value exposed as the mock's ``status``.

    Returns:
        MagicMock: response stand-in with ``status`` set.
    """
    return MagicMock(status=response_status)
29+
30+
31+
def make_allowed_response_codes_list():
    """Return the status codes the tests treat as allowed for html saving.

    Returns:
        list: every 2xx status code, 200 through 299 inclusive. A real list
            is built (rather than a lazy ``range``) so the value matches the
            list type documented for the ``save_html_on_codes`` setting on
            both Python 2 and Python 3.
    """
    return list(range(200, 300))
33+
34+
35+
def make_downloader(save_html_on_codes=None):
    """Construct an HtmlStorageMiddleware configured for the tests.

    Args:
        save_html_on_codes (list): response status codes allowed for html
            saving. When omitted, an empty list is used.

    Returns:
        HtmlStorageMiddleware: middleware built from settings with
            ``gzip_output`` enabled and the given status code list.
    """
    # A fresh list per call avoids sharing one mutable default object
    # between every middleware built by this helper.
    if save_html_on_codes is None:
        save_html_on_codes = []

    settings = Settings()
    settings.set('HTML_STORAGE', {
        'gzip_output': True,
        'save_html_on_codes': save_html_on_codes,
    })
    return HtmlStorageMiddleware(settings)
42+
43+
44+
@pytest.mark.parametrize(('response_status', 'as_expected'), [
    (200, True),
    (299, True),
    (300, False),
    (404, False),
])
def test_should_save_html_returns_true_when_request_metainformation_has_special_key_set_and_appropriate_response_status(response_status, as_expected):
    """Html is saved only when the response status is in the allowed list."""
    downloader = make_downloader(make_allowed_response_codes_list())
    request = make_request_mock(save_html=True)
    response = make_response_mock(response_status=response_status)

    assert_that(
        downloader._should_save_html(request, response),
        is_(as_expected)
    )
58+
2259

23-
def test_should_save_html_returns_true_when_request_metainformation_has_special_key_set():
60+
@pytest.mark.parametrize('response_status', [200, 299, 300, 404])
def test_should_save_html_returns_true_when_request_metainformation_has_special_key_set_and_allowed_resonse_codes_list_is_empty(response_status):
    """With an empty allowed list every response status is saved."""
    downloader = make_downloader()
    request = make_request_mock(save_html=True)
    response = make_response_mock(response_status=response_status)

    assert_that(
        downloader._should_save_html(request, response),
        is_(True)
    )
2969

@@ -33,8 +73,9 @@ def test_process_response_stores_response_body_to_file_if_request_asks_for_it(
3373
write_to_file_mock):
3474
downloader = HtmlStorageMiddleware(Settings())
3575
request_mock = make_request_mock(save_html=True)
76+
response_mock = make_response_mock(response_status=200)
3677

37-
downloader.process_response(request_mock, MagicMock(), MagicMock())
78+
downloader.process_response(request_mock, response_mock, MagicMock())
3879

3980
assert_that(write_to_file_mock.call_count, is_(1))
4081

@@ -44,11 +85,12 @@ def test_process_response_saves_response_html_to_file_resolved_by_spider(
4485
write_to_file_mock):
4586
downloader = HtmlStorageMiddleware(Settings())
4687
request_mock = make_request_mock(save_html=True)
88+
response_mock = make_response_mock(response_status=200)
4789

4890
spider_mock = MagicMock()
4991
spider_mock.response_html_path.return_value = '/tmp/response.html'
5092

51-
downloader.process_response(request_mock, MagicMock(), spider_mock)
93+
downloader.process_response(request_mock, response_mock, spider_mock)
5294

5395
write_to_file_mock.assert_called_with('/tmp/response.html', ANY)
5496

@@ -59,8 +101,9 @@ def test_process_response_stores_response_body_to_gzip_file_if_this_setting_is_o
59101
downloader = HtmlStorageMiddleware(Settings())
60102
downloader.gzip_output = True
61103
request_mock = make_request_mock(save_html=True)
104+
response_mock = make_response_mock(response_status=200)
62105

63-
downloader.process_response(request_mock, MagicMock(), MagicMock())
106+
downloader.process_response(request_mock, response_mock, MagicMock())
64107

65108
assert_that(write_to_gzip_mock.call_count, is_(1))
66109

@@ -76,11 +119,18 @@ def test_from_settings_constructs_middleware_with_the_specified_settings():
76119

77120
def test_constructor_extracts_expected_settings():
    """The constructor copies each HTML_STORAGE option onto the middleware."""
    allowed_codes = make_allowed_response_codes_list()
    html_storage = {
        'gzip_output': True,
        'save_html_on_codes': allowed_codes,
    }
    settings = Settings()
    settings.set('HTML_STORAGE', html_storage)

    downloader = HtmlStorageMiddleware(settings)

    assert_that(downloader, has_properties(
        gzip_output=True,
        save_html_on_codes=allowed_codes
    ))
84134

85135

86136
def test_constructor_sets_empty_settings_when_middleware_settings_are_not_specified():

0 commit comments

Comments
 (0)