diff --git a/docs/examples/code_examples/fill_and_submit_web_form_crawler.py b/docs/examples/code_examples/fill_and_submit_web_form_crawler.py
index c00a784411..0545c66680 100644
--- a/docs/examples/code_examples/fill_and_submit_web_form_crawler.py
+++ b/docs/examples/code_examples/fill_and_submit_web_form_crawler.py
@@ -12,7 +12,7 @@ async def main() -> None:
     @crawler.router.default_handler
     async def request_handler(context: HttpCrawlingContext) -> None:
         context.log.info(f'Processing {context.request.url} ...')
-        response = context.http_response.read().decode('utf-8')
+        response = (await context.http_response.read()).decode('utf-8')
         context.log.info(f'Response: {response}')  # To see the response in the logs.

     # Prepare a POST request to the form endpoint.
diff --git a/docs/guides/code_examples/creating_web_archive/manual_archiving_parsel_crawler.py b/docs/guides/code_examples/creating_web_archive/manual_archiving_parsel_crawler.py
index 5e1da16bf6..58e5cfed2a 100644
--- a/docs/guides/code_examples/creating_web_archive/manual_archiving_parsel_crawler.py
+++ b/docs/guides/code_examples/creating_web_archive/manual_archiving_parsel_crawler.py
@@ -8,10 +8,10 @@
 from crawlee.crawlers import ParselCrawler, ParselCrawlingContext


-def archive_response(context: ParselCrawlingContext, writer: WARCWriter) -> None:
+async def archive_response(context: ParselCrawlingContext, writer: WARCWriter) -> None:
     """Helper function for archiving response in WARC format."""
     # Create WARC records for response
-    response_body = context.http_response.read()
+    response_body = await context.http_response.read()
     response_payload_stream = io.BytesIO(response_body)

     response_headers = StatusAndHeaders(
@@ -51,7 +51,7 @@ async def main() -> None:
     @crawler.router.default_handler
     async def request_handler(context: ParselCrawlingContext) -> None:
         context.log.info(f'Archiving {context.request.url} ...')
-        archive_response(context=context, writer=writer)
+        await archive_response(context=context, writer=writer)
         await context.enqueue_links(strategy='same-domain')

     await crawler.run(['https://crawlee.dev/'])
diff --git a/docs/guides/code_examples/error_handling/change_handle_error_status.py b/docs/guides/code_examples/error_handling/change_handle_error_status.py
index 3b721545b2..4b69a54007 100644
--- a/docs/guides/code_examples/error_handling/change_handle_error_status.py
+++ b/docs/guides/code_examples/error_handling/change_handle_error_status.py
@@ -30,7 +30,7 @@ async def default_handler(context: HttpCrawlingContext) -> None:
             response = await context.send_request(
                 'https://placeholder.org/refresh', headers=headers
             )
-            data = json.loads(response.read())
+            data = json.loads(await response.read())
             # Add the new token to our `Request` headers
             new_headers = {
                 **context.request.headers,
diff --git a/docs/guides/code_examples/login_crawler/http_login.py b/docs/guides/code_examples/login_crawler/http_login.py
index 5da5781045..2b7cb6050a 100644
--- a/docs/guides/code_examples/login_crawler/http_login.py
+++ b/docs/guides/code_examples/login_crawler/http_login.py
@@ -46,7 +46,7 @@ async def login_handler(context: HttpCrawlingContext) -> None:
             raise RuntimeError('Session not found')

         # Parse the API response containing authentication tokens and user data
-        data = json.loads(context.http_response.read())
+        data = json.loads(await context.http_response.read())

         # Extract authentication data from the response
         token = data['token']
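The documentation examples above all make the same mechanical change: `HttpResponse.read()` is now a coroutine, as the note added to `docs/upgrading/upgrading_to_v1.md` further down summarizes. A minimal sketch of the handler-level migration in isolation (illustrative only, not part of this diff; the start URL is a placeholder):

```python
import asyncio

from crawlee.crawlers import HttpCrawler, HttpCrawlingContext


async def main() -> None:
    crawler = HttpCrawler()

    @crawler.router.default_handler
    async def request_handler(context: HttpCrawlingContext) -> None:
        # Before v1: body = context.http_response.read().decode()
        # After v1: `read()` is a coroutine, so it must be awaited before decoding.
        body = (await context.http_response.read()).decode()
        context.log.info(f'{context.request.url} returned {len(body)} characters')

    await crawler.run(['https://crawlee.dev/'])


if __name__ == '__main__':
    asyncio.run(main())
```

The same pattern applies to responses returned by `context.send_request()`, which is why the error-handling, login, and session-management examples in this diff all gain an `await` as well.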
diff --git a/docs/guides/code_examples/session_management/sm_basic.py b/docs/guides/code_examples/session_management/sm_basic.py
index 958ad5a665..30e1d7ae92 100644
--- a/docs/guides/code_examples/session_management/sm_basic.py
+++ b/docs/guides/code_examples/session_management/sm_basic.py
@@ -30,7 +30,7 @@ async def default_handler(context: BasicCrawlingContext) -> None:
         # and `context.proxy_info`.
         response = await context.send_request(context.request.url)

-        page_content = response.read().decode()
+        page_content = (await response.read()).decode()
         title_match = re.search(r'<title(?:.*?)>(.*?)</title>', page_content)

         if context.session and (title := title_match.group(1) if title_match else None):
diff --git a/docs/guides/code_examples/session_management/sm_http.py b/docs/guides/code_examples/session_management/sm_http.py
index cd12d04bdf..9497594d3b 100644
--- a/docs/guides/code_examples/session_management/sm_http.py
+++ b/docs/guides/code_examples/session_management/sm_http.py
@@ -26,7 +26,7 @@ async def main() -> None:
     # based on the response content and potential blocking
     @crawler.router.default_handler
     async def default_handler(context: HttpCrawlingContext) -> None:
-        page_content = context.http_response.read().decode()
+        page_content = (await context.http_response.read()).decode()
         title_match = re.search(r'<title(?:.*?)>(.*?)</title>', page_content)

         if context.session and (title := title_match.group(1) if title_match else None):
diff --git a/docs/upgrading/upgrading_to_v1.md b/docs/upgrading/upgrading_to_v1.md
index 6e5cc9df2c..4897a7b956 100644
--- a/docs/upgrading/upgrading_to_v1.md
+++ b/docs/upgrading/upgrading_to_v1.md
@@ -121,6 +121,8 @@ dataset = await Dataset.open(

 The `persist_storage` and `persist_metadata` fields have been removed from the `Configuration` class. Persistence is now determined solely by the storage client class you use.

+The `read` method for `HttpResponse` has been changed from synchronous to asynchronous.
+
 ### Storage client instance behavior

 Instance caching is implemented for the storage open methods: `Dataset.open()`, `KeyValueStore.open()`,
diff --git a/src/crawlee/_utils/robots.py b/src/crawlee/_utils/robots.py
index 5cbe59b5ed..f45fe9e604 100644
--- a/src/crawlee/_utils/robots.py
+++ b/src/crawlee/_utils/robots.py
@@ -57,7 +57,9 @@ async def load(cls, url: str, http_client: HttpClient, proxy_info: ProxyInfo | N
             proxy_info: Optional `ProxyInfo` to be used when fetching the robots.txt file. If None, no proxy is used.
         """
         response = await http_client.send_request(url, proxy_info=proxy_info)
-        body = b'User-agent: *\nAllow: /' if is_status_code_client_error(response.status_code) else response.read()
+        body = (
+            b'User-agent: *\nAllow: /' if is_status_code_client_error(response.status_code) else await response.read()
+        )

         robots = Protego.parse(body.decode('utf-8'))
diff --git a/src/crawlee/crawlers/_abstract_http/_http_crawling_context.py b/src/crawlee/crawlers/_abstract_http/_http_crawling_context.py
index fb5d6802f9..5692e924a0 100644
--- a/src/crawlee/crawlers/_abstract_http/_http_crawling_context.py
+++ b/src/crawlee/crawlers/_abstract_http/_http_crawling_context.py
@@ -26,7 +26,7 @@ def from_basic_crawling_context(cls, context: BasicCrawlingContext, http_respons

     async def get_snapshot(self) -> PageSnapshot:
         """Get snapshot of crawled page."""
-        return PageSnapshot(html=self.http_response.read().decode('utf-8'))
+        return PageSnapshot(html=(await self.http_response.read()).decode('utf-8'))


 @dataclass(frozen=True)
diff --git a/src/crawlee/crawlers/_beautifulsoup/_beautifulsoup_parser.py b/src/crawlee/crawlers/_beautifulsoup/_beautifulsoup_parser.py
index f0543b9a8e..11d3374a93 100644
--- a/src/crawlee/crawlers/_beautifulsoup/_beautifulsoup_parser.py
+++ b/src/crawlee/crawlers/_beautifulsoup/_beautifulsoup_parser.py
@@ -21,7 +21,7 @@ def __init__(self, parser: BeautifulSoupParserType = 'lxml') -> None:

     @override
     async def parse(self, response: HttpResponse) -> BeautifulSoup:
-        return BeautifulSoup(response.read(), features=self._parser)
+        return BeautifulSoup(await response.read(), features=self._parser)

     @override
     async def parse_text(self, text: str) -> BeautifulSoup:
diff --git a/src/crawlee/crawlers/_http/_http_crawler.py b/src/crawlee/crawlers/_http/_http_crawler.py
index f81f68fd40..e8034b9c35 100644
--- a/src/crawlee/crawlers/_http/_http_crawler.py
+++ b/src/crawlee/crawlers/_http/_http_crawler.py
@@ -36,7 +36,7 @@ async def request_handler(context: HttpCrawlingContext) -> None:
             # Extract data from the page.
             data = {
                 'url': context.request.url,
-                'response': context.http_response.read().decode()[:100],
+                'response': (await context.http_response.read()).decode()[:100],
             }

             # Push the extracted data to the default dataset.
diff --git a/src/crawlee/crawlers/_http/_http_parser.py b/src/crawlee/crawlers/_http/_http_parser.py
index 0a9af538dc..91c05aaee0 100644
--- a/src/crawlee/crawlers/_http/_http_parser.py
+++ b/src/crawlee/crawlers/_http/_http_parser.py
@@ -21,7 +21,7 @@ class NoParser(AbstractHttpParser[bytes, bytes]):

     @override
     async def parse(self, response: HttpResponse) -> bytes:
-        return response.read()
+        return await response.read()

     @override
     async def parse_text(self, text: str) -> bytes:
diff --git a/src/crawlee/crawlers/_parsel/_parsel_parser.py b/src/crawlee/crawlers/_parsel/_parsel_parser.py
index 9baa1eba7c..e1de883efc 100644
--- a/src/crawlee/crawlers/_parsel/_parsel_parser.py
+++ b/src/crawlee/crawlers/_parsel/_parsel_parser.py
@@ -19,7 +19,8 @@ class ParselParser(AbstractHttpParser[Selector, Selector]):

     @override
     async def parse(self, response: HttpResponse) -> Selector:
-        return await asyncio.to_thread(lambda: Selector(body=response.read()))
+        response_body = await response.read()
+        return await asyncio.to_thread(lambda: Selector(body=response_body))

     @override
     async def parse_text(self, text: str) -> Selector:
diff --git a/src/crawlee/crawlers/_playwright/_types.py b/src/crawlee/crawlers/_playwright/_types.py
index 8d0000adab..17e6a20a46 100644
--- a/src/crawlee/crawlers/_playwright/_types.py
+++ b/src/crawlee/crawlers/_playwright/_types.py
@@ -41,7 +41,7 @@ class PlaywrightHttpResponse:
     headers: HttpHeaders
     _content: bytes

-    def read(self) -> bytes:
+    async def read(self) -> bytes:
         return self._content

     async def read_stream(self) -> AsyncGenerator[bytes, None]:
diff --git a/src/crawlee/http_clients/_base.py b/src/crawlee/http_clients/_base.py
index 5cb48759e3..36ddb6e2ca 100644
--- a/src/crawlee/http_clients/_base.py
+++ b/src/crawlee/http_clients/_base.py
@@ -35,7 +35,7 @@ def status_code(self) -> int:
     def headers(self) -> HttpHeaders:
         """The HTTP headers received in the response."""

-    def read(self) -> bytes:
+    async def read(self) -> bytes:
         """Read the entire content of the response body.

         This method loads the complete response body into memory at once. It should be used
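The abstract `HttpResponse` above now offers two consumption paths: the asynchronous `read()` for buffered bodies and `read_stream()` for streamed ones, where the concrete clients further down deliberately raise `RuntimeError` from `read()`. A rough sketch of the distinction, assuming `HttpxHttpClient` is exported from `crawlee.http_clients` and that the client's `stream()` method can be used as an async context manager (not confirmed by this diff):

```python
import asyncio

from crawlee.http_clients import HttpxHttpClient


async def main() -> None:
    client = HttpxHttpClient()

    # Buffered response: `read()` is now a coroutine on every `HttpResponse`.
    response = await client.send_request('https://crawlee.dev/')
    print(response.status_code, len(await response.read()))

    # Streamed response: the concrete clients raise `RuntimeError` from `read()`
    # here, so the body is consumed chunk by chunk via `read_stream()` instead.
    # (Assumption: `stream()` works as an async context manager.)
    async with client.stream('https://crawlee.dev/') as streamed:
        size = sum([len(chunk) async for chunk in streamed.read_stream()])
        print(size)


if __name__ == '__main__':
    asyncio.run(main())
```

Reading the body up front is also why `ParselParser.parse()` above fetches `response_body` before handing it to `asyncio.to_thread()`: the lambda runs in a worker thread and cannot `await`.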
diff --git a/src/crawlee/http_clients/_curl_impersonate.py b/src/crawlee/http_clients/_curl_impersonate.py
index aebd689c29..5a0e24799e 100644
--- a/src/crawlee/http_clients/_curl_impersonate.py
+++ b/src/crawlee/http_clients/_curl_impersonate.py
@@ -85,7 +85,7 @@ def status_code(self) -> int:
     def headers(self) -> HttpHeaders:
         return HttpHeaders({key: value for key, value in self._response.headers.items() if value})

-    def read(self) -> bytes:
+    async def read(self) -> bytes:
         if self._response.astream_task:
             raise RuntimeError('Use `read_stream` to read the body of the Response received from the `stream` method')
         return self._response.content
diff --git a/src/crawlee/http_clients/_httpx.py b/src/crawlee/http_clients/_httpx.py
index ca14a3c627..c1ddddf6d7 100644
--- a/src/crawlee/http_clients/_httpx.py
+++ b/src/crawlee/http_clients/_httpx.py
@@ -46,10 +46,10 @@ def status_code(self) -> int:
     def headers(self) -> HttpHeaders:
         return HttpHeaders(dict(self._response.headers))

-    def read(self) -> bytes:
+    async def read(self) -> bytes:
         if not self._response.is_closed:
             raise RuntimeError('Use `read_stream` to read the body of the Response received from the `stream` method')
-        return self._response.read()
+        return await self._response.aread()

     async def read_stream(self) -> AsyncIterator[bytes]:
         if self._response.is_stream_consumed:
diff --git a/tests/unit/crawlers/_basic/test_basic_crawler.py b/tests/unit/crawlers/_basic/test_basic_crawler.py
index 4e8a513118..23297a30b5 100644
--- a/tests/unit/crawlers/_basic/test_basic_crawler.py
+++ b/tests/unit/crawlers/_basic/test_basic_crawler.py
@@ -331,7 +331,7 @@ async def test_send_request_works(server_url: URL, method: HttpMethod, path: str
     async def handler(context: BasicCrawlingContext) -> None:
         response = await context.send_request(str(server_url / path), method=method, payload=payload)
-        response_data['body'] = json.loads(response.read())
+        response_data['body'] = json.loads(await response.read())
         response_data['headers'] = response.headers

     await crawler.run(['https://a.placeholder.com', 'https://b.placeholder.com', 'https://c.placeholder.com'])
diff --git a/tests/unit/crawlers/_http/test_http_crawler.py b/tests/unit/crawlers/_http/test_http_crawler.py
index 9b83679565..2f973f32d7 100644
--- a/tests/unit/crawlers/_http/test_http_crawler.py
+++ b/tests/unit/crawlers/_http/test_http_crawler.py
@@ -240,7 +240,7 @@ async def test_sending_payload_as_raw_data(http_client: HttpClient, server_url:

     @crawler.router.default_handler
     async def request_handler(context: HttpCrawlingContext) -> None:
-        response = json.loads(context.http_response.read())
+        response = json.loads(await context.http_response.read())
         # The post endpoint returns the provided payload in the response.
         responses.append(response)

@@ -271,7 +271,7 @@ async def test_sending_payload_as_form_data(http_client: HttpClient, server_url:

     @crawler.router.default_handler
     async def request_handler(context: HttpCrawlingContext) -> None:
-        response = json.loads(context.http_response.read())
+        response = json.loads(await context.http_response.read())
         # The /post endpoint returns the provided payload in the response.
         responses.append(response)

@@ -297,7 +297,7 @@ async def test_sending_payload_as_json(http_client: HttpClient, server_url: URL)

     @crawler.router.default_handler
     async def request_handler(context: HttpCrawlingContext) -> None:
-        response = json.loads(context.http_response.read())
+        response = json.loads(await context.http_response.read())
         # The /post endpoint returns the provided payload in the response.
         responses.append(response)

@@ -324,7 +324,7 @@ async def test_sending_url_query_params(http_client: HttpClient, server_url: URL

     @crawler.router.default_handler
     async def request_handler(context: HttpCrawlingContext) -> None:
-        response = json.loads(context.http_response.read())
+        response = json.loads(await context.http_response.read())
         # The /get endpoint returns the provided query parameters in the response.
         responses.append(response)

@@ -397,7 +397,7 @@ async def handler(context: HttpCrawlingContext) -> None:
         sessions_cookies[context.session.id] = {
             cookie['name']: cookie['value'] for cookie in context.session.cookies.get_cookies_as_dicts()
         }
-        response_data = json.loads(context.http_response.read())
+        response_data = json.loads(await context.http_response.read())
         response_cookies[context.session.id] = response_data.get('cookies')

         if context.request.user_data.get('retire_session'):
diff --git a/tests/unit/crawlers/_playwright/test_playwright_crawler.py b/tests/unit/crawlers/_playwright/test_playwright_crawler.py
index fdea9942ff..64fd96f8ef 100644
--- a/tests/unit/crawlers/_playwright/test_playwright_crawler.py
+++ b/tests/unit/crawlers/_playwright/test_playwright_crawler.py
@@ -672,14 +672,14 @@ async def test_send_request(server_url: URL) -> None:

     @crawler.pre_navigation_hook
     async def some_hook(context: PlaywrightPreNavCrawlingContext) -> None:
         send_request_response = await context.send_request(str(server_url / 'user-agent'))
-        check_data['pre_send_request'] = dict(json.loads(send_request_response.read()))
+        check_data['pre_send_request'] = dict(json.loads(await send_request_response.read()))

     @crawler.router.default_handler
     async def request_handler(context: PlaywrightCrawlingContext) -> None:
         response = await context.response.text()
         check_data['default'] = dict(json.loads(response))
         send_request_response = await context.send_request(str(server_url / 'user-agent'))
-        check_data['send_request'] = dict(json.loads(send_request_response.read()))
+        check_data['send_request'] = dict(json.loads(await send_request_response.read()))

     await crawler.run([str(server_url / 'user-agent')])

@@ -703,7 +703,7 @@ async def request_handler(context: PlaywrightCrawlingContext) -> None:
         response = await context.response.text()
         check_data['default'] = dict(json.loads(response))
         send_request_response = await context.send_request(str(server_url / 'user-agent'))
-        check_data['send_request'] = dict(json.loads(send_request_response.read()))
+        check_data['send_request'] = dict(json.loads(await send_request_response.read()))

     await crawler.run([str(server_url / 'user-agent')])
diff --git a/tests/unit/http_clients/test_curl_impersonate.py b/tests/unit/http_clients/test_curl_impersonate.py
index 77e79d474f..dc72500c77 100644
--- a/tests/unit/http_clients/test_curl_impersonate.py
+++ b/tests/unit/http_clients/test_curl_impersonate.py
@@ -162,7 +162,7 @@ async def test_stream_error_for_read(http_client: CurlImpersonateHttpClient, ser
         assert response.status_code == 200

         with pytest.raises(RuntimeError):
-            response.read()
+            await response.read()

 async def test_send_request_error_for_read_stream(http_client: CurlImpersonateHttpClient, server_url: URL) -> None:
diff --git a/tests/unit/http_clients/test_httpx.py b/tests/unit/http_clients/test_httpx.py
index 4bee7f9460..ba0571fd52 100644
--- a/tests/unit/http_clients/test_httpx.py
+++ b/tests/unit/http_clients/test_httpx.py
@@ -95,7 +95,7 @@ async def test_common_headers_and_user_agent(server_url: URL, header_network: di
     client = HttpxHttpClient()
     response = await client.send_request(str(server_url / 'headers'))
-    response_headers = json.loads(response.read().decode())
+    response_headers = json.loads((await response.read()).decode())

     assert 'accept' in response_headers
     assert response_headers['accept'] in get_available_header_values(header_network, {'Accept', 'accept'})
@@ -176,7 +176,7 @@ async def test_stream_error_for_read(http_client: HttpxHttpClient, server_url: U
         assert response.status_code == 200

         with pytest.raises(RuntimeError):
-            response.read()
+            await response.read()


 async def test_send_request_error_for_read_stream(http_client: HttpxHttpClient, server_url: URL) -> None: