diff --git a/docs/examples/code_examples/fill_and_submit_web_form_crawler.py b/docs/examples/code_examples/fill_and_submit_web_form_crawler.py
index c00a784411..0545c66680 100644
--- a/docs/examples/code_examples/fill_and_submit_web_form_crawler.py
+++ b/docs/examples/code_examples/fill_and_submit_web_form_crawler.py
@@ -12,7 +12,7 @@ async def main() -> None:
@crawler.router.default_handler
async def request_handler(context: HttpCrawlingContext) -> None:
context.log.info(f'Processing {context.request.url} ...')
- response = context.http_response.read().decode('utf-8')
+ response = (await context.http_response.read()).decode('utf-8')
context.log.info(f'Response: {response}') # To see the response in the logs.
# Prepare a POST request to the form endpoint.
diff --git a/docs/guides/code_examples/creating_web_archive/manual_archiving_parsel_crawler.py b/docs/guides/code_examples/creating_web_archive/manual_archiving_parsel_crawler.py
index 5e1da16bf6..58e5cfed2a 100644
--- a/docs/guides/code_examples/creating_web_archive/manual_archiving_parsel_crawler.py
+++ b/docs/guides/code_examples/creating_web_archive/manual_archiving_parsel_crawler.py
@@ -8,10 +8,10 @@
from crawlee.crawlers import ParselCrawler, ParselCrawlingContext
-def archive_response(context: ParselCrawlingContext, writer: WARCWriter) -> None:
+async def archive_response(context: ParselCrawlingContext, writer: WARCWriter) -> None:
"""Helper function for archiving response in WARC format."""
# Create WARC records for response
- response_body = context.http_response.read()
+ response_body = await context.http_response.read()
response_payload_stream = io.BytesIO(response_body)
response_headers = StatusAndHeaders(
@@ -51,7 +51,7 @@ async def main() -> None:
@crawler.router.default_handler
async def request_handler(context: ParselCrawlingContext) -> None:
context.log.info(f'Archiving {context.request.url} ...')
- archive_response(context=context, writer=writer)
+ await archive_response(context=context, writer=writer)
await context.enqueue_links(strategy='same-domain')
await crawler.run(['https://crawlee.dev/'])
diff --git a/docs/guides/code_examples/error_handling/change_handle_error_status.py b/docs/guides/code_examples/error_handling/change_handle_error_status.py
index 3b721545b2..4b69a54007 100644
--- a/docs/guides/code_examples/error_handling/change_handle_error_status.py
+++ b/docs/guides/code_examples/error_handling/change_handle_error_status.py
@@ -30,7 +30,7 @@ async def default_handler(context: HttpCrawlingContext) -> None:
response = await context.send_request(
'https://placeholder.org/refresh', headers=headers
)
- data = json.loads(response.read())
+ data = json.loads(await response.read())
# Add the new token to our `Request` headers
new_headers = {
**context.request.headers,
diff --git a/docs/guides/code_examples/login_crawler/http_login.py b/docs/guides/code_examples/login_crawler/http_login.py
index 5da5781045..2b7cb6050a 100644
--- a/docs/guides/code_examples/login_crawler/http_login.py
+++ b/docs/guides/code_examples/login_crawler/http_login.py
@@ -46,7 +46,7 @@ async def login_handler(context: HttpCrawlingContext) -> None:
raise RuntimeError('Session not found')
# Parse the API response containing authentication tokens and user data
- data = json.loads(context.http_response.read())
+ data = json.loads(await context.http_response.read())
# Extract authentication data from the response
token = data['token']
diff --git a/docs/guides/code_examples/session_management/sm_basic.py b/docs/guides/code_examples/session_management/sm_basic.py
index 958ad5a665..30e1d7ae92 100644
--- a/docs/guides/code_examples/session_management/sm_basic.py
+++ b/docs/guides/code_examples/session_management/sm_basic.py
@@ -30,7 +30,7 @@ async def default_handler(context: BasicCrawlingContext) -> None:
# and `context.proxy_info`.
response = await context.send_request(context.request.url)
- page_content = response.read().decode()
+ page_content = (await response.read()).decode()
        title_match = re.search(r'<title>(.*?)</title>', page_content)
if context.session and (title := title_match.group(1) if title_match else None):
diff --git a/docs/guides/code_examples/session_management/sm_http.py b/docs/guides/code_examples/session_management/sm_http.py
index cd12d04bdf..9497594d3b 100644
--- a/docs/guides/code_examples/session_management/sm_http.py
+++ b/docs/guides/code_examples/session_management/sm_http.py
@@ -26,7 +26,7 @@ async def main() -> None:
# based on the response content and potential blocking
@crawler.router.default_handler
async def default_handler(context: HttpCrawlingContext) -> None:
- page_content = context.http_response.read().decode()
+ page_content = (await context.http_response.read()).decode()
        title_match = re.search(r'<title>(.*?)</title>', page_content)
if context.session and (title := title_match.group(1) if title_match else None):
diff --git a/docs/upgrading/upgrading_to_v1.md b/docs/upgrading/upgrading_to_v1.md
index 6e5cc9df2c..4897a7b956 100644
--- a/docs/upgrading/upgrading_to_v1.md
+++ b/docs/upgrading/upgrading_to_v1.md
@@ -121,6 +121,8 @@ dataset = await Dataset.open(
The `persist_storage` and `persist_metadata` fields have been removed from the `Configuration` class.
Persistence is now determined solely by the storage client class you use.
+The `read` method of `HttpResponse` has been changed from synchronous to asynchronous; existing calls to it must now be awaited.
+
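+A minimal before/after sketch of the change inside a request handler (assuming an `HttpCrawlingContext` named `context`):
+
+```python
+# Before (synchronous read):
+# body = context.http_response.read()
+
+# After (asynchronous read, must be awaited):
+body = await context.http_response.read()
+text = body.decode('utf-8')
+```
+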
### Storage client instance behavior
Instance caching is implemented for the storage open methods: `Dataset.open()`, `KeyValueStore.open()`,
diff --git a/src/crawlee/_utils/robots.py b/src/crawlee/_utils/robots.py
index 5cbe59b5ed..f45fe9e604 100644
--- a/src/crawlee/_utils/robots.py
+++ b/src/crawlee/_utils/robots.py
@@ -57,7 +57,9 @@ async def load(cls, url: str, http_client: HttpClient, proxy_info: ProxyInfo | N
proxy_info: Optional `ProxyInfo` to be used when fetching the robots.txt file. If None, no proxy is used.
"""
response = await http_client.send_request(url, proxy_info=proxy_info)
- body = b'User-agent: *\nAllow: /' if is_status_code_client_error(response.status_code) else response.read()
+ body = (
+ b'User-agent: *\nAllow: /' if is_status_code_client_error(response.status_code) else await response.read()
+ )
robots = Protego.parse(body.decode('utf-8'))
diff --git a/src/crawlee/crawlers/_abstract_http/_http_crawling_context.py b/src/crawlee/crawlers/_abstract_http/_http_crawling_context.py
index fb5d6802f9..5692e924a0 100644
--- a/src/crawlee/crawlers/_abstract_http/_http_crawling_context.py
+++ b/src/crawlee/crawlers/_abstract_http/_http_crawling_context.py
@@ -26,7 +26,7 @@ def from_basic_crawling_context(cls, context: BasicCrawlingContext, http_respons
async def get_snapshot(self) -> PageSnapshot:
"""Get snapshot of crawled page."""
- return PageSnapshot(html=self.http_response.read().decode('utf-8'))
+ return PageSnapshot(html=(await self.http_response.read()).decode('utf-8'))
@dataclass(frozen=True)
diff --git a/src/crawlee/crawlers/_beautifulsoup/_beautifulsoup_parser.py b/src/crawlee/crawlers/_beautifulsoup/_beautifulsoup_parser.py
index f0543b9a8e..11d3374a93 100644
--- a/src/crawlee/crawlers/_beautifulsoup/_beautifulsoup_parser.py
+++ b/src/crawlee/crawlers/_beautifulsoup/_beautifulsoup_parser.py
@@ -21,7 +21,7 @@ def __init__(self, parser: BeautifulSoupParserType = 'lxml') -> None:
@override
async def parse(self, response: HttpResponse) -> BeautifulSoup:
- return BeautifulSoup(response.read(), features=self._parser)
+ return BeautifulSoup(await response.read(), features=self._parser)
@override
async def parse_text(self, text: str) -> BeautifulSoup:
diff --git a/src/crawlee/crawlers/_http/_http_crawler.py b/src/crawlee/crawlers/_http/_http_crawler.py
index f81f68fd40..e8034b9c35 100644
--- a/src/crawlee/crawlers/_http/_http_crawler.py
+++ b/src/crawlee/crawlers/_http/_http_crawler.py
@@ -36,7 +36,7 @@ async def request_handler(context: HttpCrawlingContext) -> None:
# Extract data from the page.
data = {
'url': context.request.url,
- 'response': context.http_response.read().decode()[:100],
+ 'response': (await context.http_response.read()).decode()[:100],
}
# Push the extracted data to the default dataset.
diff --git a/src/crawlee/crawlers/_http/_http_parser.py b/src/crawlee/crawlers/_http/_http_parser.py
index 0a9af538dc..91c05aaee0 100644
--- a/src/crawlee/crawlers/_http/_http_parser.py
+++ b/src/crawlee/crawlers/_http/_http_parser.py
@@ -21,7 +21,7 @@ class NoParser(AbstractHttpParser[bytes, bytes]):
@override
async def parse(self, response: HttpResponse) -> bytes:
- return response.read()
+ return await response.read()
@override
async def parse_text(self, text: str) -> bytes:
diff --git a/src/crawlee/crawlers/_parsel/_parsel_parser.py b/src/crawlee/crawlers/_parsel/_parsel_parser.py
index 9baa1eba7c..e1de883efc 100644
--- a/src/crawlee/crawlers/_parsel/_parsel_parser.py
+++ b/src/crawlee/crawlers/_parsel/_parsel_parser.py
@@ -19,7 +19,8 @@ class ParselParser(AbstractHttpParser[Selector, Selector]):
@override
async def parse(self, response: HttpResponse) -> Selector:
- return await asyncio.to_thread(lambda: Selector(body=response.read()))
+ response_body = await response.read()
+ return await asyncio.to_thread(lambda: Selector(body=response_body))
@override
async def parse_text(self, text: str) -> Selector:
diff --git a/src/crawlee/crawlers/_playwright/_types.py b/src/crawlee/crawlers/_playwright/_types.py
index 8d0000adab..17e6a20a46 100644
--- a/src/crawlee/crawlers/_playwright/_types.py
+++ b/src/crawlee/crawlers/_playwright/_types.py
@@ -41,7 +41,7 @@ class PlaywrightHttpResponse:
headers: HttpHeaders
_content: bytes
- def read(self) -> bytes:
+ async def read(self) -> bytes:
return self._content
async def read_stream(self) -> AsyncGenerator[bytes, None]:
diff --git a/src/crawlee/http_clients/_base.py b/src/crawlee/http_clients/_base.py
index 5cb48759e3..36ddb6e2ca 100644
--- a/src/crawlee/http_clients/_base.py
+++ b/src/crawlee/http_clients/_base.py
@@ -35,7 +35,7 @@ def status_code(self) -> int:
def headers(self) -> HttpHeaders:
"""The HTTP headers received in the response."""
- def read(self) -> bytes:
+ async def read(self) -> bytes:
"""Read the entire content of the response body.
This method loads the complete response body into memory at once. It should be used
diff --git a/src/crawlee/http_clients/_curl_impersonate.py b/src/crawlee/http_clients/_curl_impersonate.py
index aebd689c29..5a0e24799e 100644
--- a/src/crawlee/http_clients/_curl_impersonate.py
+++ b/src/crawlee/http_clients/_curl_impersonate.py
@@ -85,7 +85,7 @@ def status_code(self) -> int:
def headers(self) -> HttpHeaders:
return HttpHeaders({key: value for key, value in self._response.headers.items() if value})
- def read(self) -> bytes:
+ async def read(self) -> bytes:
if self._response.astream_task:
raise RuntimeError('Use `read_stream` to read the body of the Response received from the `stream` method')
return self._response.content
diff --git a/src/crawlee/http_clients/_httpx.py b/src/crawlee/http_clients/_httpx.py
index ca14a3c627..c1ddddf6d7 100644
--- a/src/crawlee/http_clients/_httpx.py
+++ b/src/crawlee/http_clients/_httpx.py
@@ -46,10 +46,10 @@ def status_code(self) -> int:
def headers(self) -> HttpHeaders:
return HttpHeaders(dict(self._response.headers))
- def read(self) -> bytes:
+ async def read(self) -> bytes:
if not self._response.is_closed:
raise RuntimeError('Use `read_stream` to read the body of the Response received from the `stream` method')
- return self._response.read()
+ return await self._response.aread()
async def read_stream(self) -> AsyncIterator[bytes]:
if self._response.is_stream_consumed:
diff --git a/tests/unit/crawlers/_basic/test_basic_crawler.py b/tests/unit/crawlers/_basic/test_basic_crawler.py
index 4e8a513118..23297a30b5 100644
--- a/tests/unit/crawlers/_basic/test_basic_crawler.py
+++ b/tests/unit/crawlers/_basic/test_basic_crawler.py
@@ -331,7 +331,7 @@ async def test_send_request_works(server_url: URL, method: HttpMethod, path: str
async def handler(context: BasicCrawlingContext) -> None:
response = await context.send_request(str(server_url / path), method=method, payload=payload)
- response_data['body'] = json.loads(response.read())
+ response_data['body'] = json.loads(await response.read())
response_data['headers'] = response.headers
await crawler.run(['https://a.placeholder.com', 'https://b.placeholder.com', 'https://c.placeholder.com'])
diff --git a/tests/unit/crawlers/_http/test_http_crawler.py b/tests/unit/crawlers/_http/test_http_crawler.py
index 9b83679565..2f973f32d7 100644
--- a/tests/unit/crawlers/_http/test_http_crawler.py
+++ b/tests/unit/crawlers/_http/test_http_crawler.py
@@ -240,7 +240,7 @@ async def test_sending_payload_as_raw_data(http_client: HttpClient, server_url:
@crawler.router.default_handler
async def request_handler(context: HttpCrawlingContext) -> None:
- response = json.loads(context.http_response.read())
+ response = json.loads(await context.http_response.read())
# The post endpoint returns the provided payload in the response.
responses.append(response)
@@ -271,7 +271,7 @@ async def test_sending_payload_as_form_data(http_client: HttpClient, server_url:
@crawler.router.default_handler
async def request_handler(context: HttpCrawlingContext) -> None:
- response = json.loads(context.http_response.read())
+ response = json.loads(await context.http_response.read())
# The /post endpoint returns the provided payload in the response.
responses.append(response)
@@ -297,7 +297,7 @@ async def test_sending_payload_as_json(http_client: HttpClient, server_url: URL)
@crawler.router.default_handler
async def request_handler(context: HttpCrawlingContext) -> None:
- response = json.loads(context.http_response.read())
+ response = json.loads(await context.http_response.read())
# The /post endpoint returns the provided payload in the response.
responses.append(response)
@@ -324,7 +324,7 @@ async def test_sending_url_query_params(http_client: HttpClient, server_url: URL
@crawler.router.default_handler
async def request_handler(context: HttpCrawlingContext) -> None:
- response = json.loads(context.http_response.read())
+ response = json.loads(await context.http_response.read())
# The /get endpoint returns the provided query parameters in the response.
responses.append(response)
@@ -397,7 +397,7 @@ async def handler(context: HttpCrawlingContext) -> None:
sessions_cookies[context.session.id] = {
cookie['name']: cookie['value'] for cookie in context.session.cookies.get_cookies_as_dicts()
}
- response_data = json.loads(context.http_response.read())
+ response_data = json.loads(await context.http_response.read())
response_cookies[context.session.id] = response_data.get('cookies')
if context.request.user_data.get('retire_session'):
diff --git a/tests/unit/crawlers/_playwright/test_playwright_crawler.py b/tests/unit/crawlers/_playwright/test_playwright_crawler.py
index fdea9942ff..64fd96f8ef 100644
--- a/tests/unit/crawlers/_playwright/test_playwright_crawler.py
+++ b/tests/unit/crawlers/_playwright/test_playwright_crawler.py
@@ -672,14 +672,14 @@ async def test_send_request(server_url: URL) -> None:
@crawler.pre_navigation_hook
async def some_hook(context: PlaywrightPreNavCrawlingContext) -> None:
send_request_response = await context.send_request(str(server_url / 'user-agent'))
- check_data['pre_send_request'] = dict(json.loads(send_request_response.read()))
+ check_data['pre_send_request'] = dict(json.loads(await send_request_response.read()))
@crawler.router.default_handler
async def request_handler(context: PlaywrightCrawlingContext) -> None:
response = await context.response.text()
check_data['default'] = dict(json.loads(response))
send_request_response = await context.send_request(str(server_url / 'user-agent'))
- check_data['send_request'] = dict(json.loads(send_request_response.read()))
+ check_data['send_request'] = dict(json.loads(await send_request_response.read()))
await crawler.run([str(server_url / 'user-agent')])
@@ -703,7 +703,7 @@ async def request_handler(context: PlaywrightCrawlingContext) -> None:
response = await context.response.text()
check_data['default'] = dict(json.loads(response))
send_request_response = await context.send_request(str(server_url / 'user-agent'))
- check_data['send_request'] = dict(json.loads(send_request_response.read()))
+ check_data['send_request'] = dict(json.loads(await send_request_response.read()))
await crawler.run([str(server_url / 'user-agent')])
diff --git a/tests/unit/http_clients/test_curl_impersonate.py b/tests/unit/http_clients/test_curl_impersonate.py
index 77e79d474f..dc72500c77 100644
--- a/tests/unit/http_clients/test_curl_impersonate.py
+++ b/tests/unit/http_clients/test_curl_impersonate.py
@@ -162,7 +162,7 @@ async def test_stream_error_for_read(http_client: CurlImpersonateHttpClient, ser
assert response.status_code == 200
with pytest.raises(RuntimeError):
- response.read()
+ await response.read()
async def test_send_request_error_for_read_stream(http_client: CurlImpersonateHttpClient, server_url: URL) -> None:
diff --git a/tests/unit/http_clients/test_httpx.py b/tests/unit/http_clients/test_httpx.py
index 4bee7f9460..ba0571fd52 100644
--- a/tests/unit/http_clients/test_httpx.py
+++ b/tests/unit/http_clients/test_httpx.py
@@ -95,7 +95,7 @@ async def test_common_headers_and_user_agent(server_url: URL, header_network: di
client = HttpxHttpClient()
response = await client.send_request(str(server_url / 'headers'))
- response_headers = json.loads(response.read().decode())
+ response_headers = json.loads((await response.read()).decode())
assert 'accept' in response_headers
assert response_headers['accept'] in get_available_header_values(header_network, {'Accept', 'accept'})
@@ -176,7 +176,7 @@ async def test_stream_error_for_read(http_client: HttpxHttpClient, server_url: U
assert response.status_code == 200
with pytest.raises(RuntimeError):
- response.read()
+ await response.read()
async def test_send_request_error_for_read_stream(http_client: HttpxHttpClient, server_url: URL) -> None: