diff --git a/src/crawlee/_utils/robots.py b/src/crawlee/_utils/robots.py
index f45fe9e604..67583c90eb 100644
--- a/src/crawlee/_utils/robots.py
+++ b/src/crawlee/_utils/robots.py
@@ -1,5 +1,6 @@
 from __future__ import annotations

+from logging import getLogger
 from typing import TYPE_CHECKING

 from protego import Protego
@@ -15,6 +16,9 @@
     from crawlee.proxy_configuration import ProxyInfo


+logger = getLogger(__name__)
+
+
 class RobotsTxtFile:
     def __init__(
         self, url: str, robots: Protego, http_client: HttpClient | None = None, proxy_info: ProxyInfo | None = None
@@ -56,12 +60,20 @@ async def load(cls, url: str, http_client: HttpClient, proxy_info: ProxyInfo | N
             http_client: The `HttpClient` instance used to perform the network request for fetching the robots.txt
                 file.
             proxy_info: Optional `ProxyInfo` to be used when fetching the robots.txt file. If None, no proxy is used.
         """
-        response = await http_client.send_request(url, proxy_info=proxy_info)
-        body = (
-            b'User-agent: *\nAllow: /' if is_status_code_client_error(response.status_code) else await response.read()
-        )
+        try:
+            response = await http_client.send_request(url, proxy_info=proxy_info)
+
+            body = (
+                b'User-agent: *\nAllow: /'
+                if is_status_code_client_error(response.status_code)
+                else await response.read()
+            )
+            robots = Protego.parse(body.decode('utf-8'))
+
+        except Exception as e:
+            logger.warning(f'Failed to fetch robots.txt from "{url}" with error: "{e}"')

-        robots = Protego.parse(body.decode('utf-8'))
+            robots = Protego.parse('User-agent: *\nAllow: /')

         return cls(url, robots, http_client=http_client, proxy_info=proxy_info)

diff --git a/tests/unit/crawlers/_beautifulsoup/test_beautifulsoup_crawler.py b/tests/unit/crawlers/_beautifulsoup/test_beautifulsoup_crawler.py
index 5856990403..6b79eef895 100644
--- a/tests/unit/crawlers/_beautifulsoup/test_beautifulsoup_crawler.py
+++ b/tests/unit/crawlers/_beautifulsoup/test_beautifulsoup_crawler.py
@@ -6,7 +6,7 @@
 import pytest

 from crawlee import ConcurrencySettings, Glob, HttpHeaders, RequestTransformAction, SkippedReason
-from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext
+from crawlee.crawlers import BasicCrawlingContext, BeautifulSoupCrawler, BeautifulSoupCrawlingContext
 from crawlee.storages import RequestQueue

 if TYPE_CHECKING:
@@ -167,6 +167,40 @@ async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
     }


+async def test_respect_robots_txt_with_problematic_links(server_url: URL, http_client: HttpClient) -> None:
+    """Test the crawler behavior with links that may cause problems when retrieving robots.txt."""
+    visit = mock.Mock()
+    fail = mock.Mock()
+    crawler = BeautifulSoupCrawler(
+        http_client=http_client,
+        respect_robots_txt_file=True,
+        max_request_retries=0,
+    )
+
+    @crawler.router.default_handler
+    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
+        visit(context.request.url)
+        await context.enqueue_links(strategy='all')
+
+    @crawler.failed_request_handler
+    async def error_handler(context: BasicCrawlingContext, _error: Exception) -> None:
+        fail(context.request.url)
+
+    await crawler.run([str(server_url / 'problematic_links')])
+
+    visited = {call[0][0] for call in visit.call_args_list}
+    failed = {call[0][0] for call in fail.call_args_list}
+
+    # The mailto link must be skipped.
+    # https://avatars.githubusercontent.com/apify has no retrievable robots.txt, but is a valid URL for the crawler.
+    assert visited == {str(server_url / 'problematic_links'), 'https://avatars.githubusercontent.com/apify'}
+
+    # budplaceholder.com does not exist, so the request must fail.
+    assert failed == {
+        'https://budplaceholder.com/',
+    }
+
+
 async def test_on_skipped_request(server_url: URL, http_client: HttpClient) -> None:
     crawler = BeautifulSoupCrawler(http_client=http_client, respect_robots_txt_file=True)
     skip = mock.Mock()
diff --git a/tests/unit/crawlers/_parsel/test_parsel_crawler.py b/tests/unit/crawlers/_parsel/test_parsel_crawler.py
index 3e2fd89612..923894bc0c 100644
--- a/tests/unit/crawlers/_parsel/test_parsel_crawler.py
+++ b/tests/unit/crawlers/_parsel/test_parsel_crawler.py
@@ -14,7 +14,7 @@
     from yarl import URL

     from crawlee._request import RequestOptions
-    from crawlee.crawlers import ParselCrawlingContext
+    from crawlee.crawlers import BasicCrawlingContext, ParselCrawlingContext
     from crawlee.http_clients._base import HttpClient


@@ -261,6 +261,40 @@ async def request_handler(context: ParselCrawlingContext) -> None:
     }


+async def test_respect_robots_txt_with_problematic_links(server_url: URL, http_client: HttpClient) -> None:
+    """Test the crawler behavior with links that may cause problems when retrieving robots.txt."""
+    visit = mock.Mock()
+    fail = mock.Mock()
+    crawler = ParselCrawler(
+        http_client=http_client,
+        respect_robots_txt_file=True,
+        max_request_retries=0,
+    )
+
+    @crawler.router.default_handler
+    async def request_handler(context: ParselCrawlingContext) -> None:
+        visit(context.request.url)
+        await context.enqueue_links(strategy='all')
+
+    @crawler.failed_request_handler
+    async def error_handler(context: BasicCrawlingContext, _error: Exception) -> None:
+        fail(context.request.url)
+
+    await crawler.run([str(server_url / 'problematic_links')])
+
+    visited = {call[0][0] for call in visit.call_args_list}
+    failed = {call[0][0] for call in fail.call_args_list}
+
+    # The mailto link must be skipped.
+    # https://avatars.githubusercontent.com/apify has no retrievable robots.txt, but is a valid URL for the crawler.
+    assert visited == {str(server_url / 'problematic_links'), 'https://avatars.githubusercontent.com/apify'}
+
+    # budplaceholder.com does not exist, so the request must fail.
+    assert failed == {
+        'https://budplaceholder.com/',
+    }
+
+
 async def test_on_skipped_request(server_url: URL, http_client: HttpClient) -> None:
     crawler = ParselCrawler(http_client=http_client, respect_robots_txt_file=True)
     skip = mock.Mock()
diff --git a/tests/unit/crawlers/_playwright/test_playwright_crawler.py b/tests/unit/crawlers/_playwright/test_playwright_crawler.py
index 178e3830c3..c7346ef20f 100644
--- a/tests/unit/crawlers/_playwright/test_playwright_crawler.py
+++ b/tests/unit/crawlers/_playwright/test_playwright_crawler.py
@@ -48,7 +48,7 @@
     from crawlee._request import RequestOptions
     from crawlee._types import HttpMethod, HttpPayload
     from crawlee.browsers._types import BrowserType
-    from crawlee.crawlers import PlaywrightCrawlingContext, PlaywrightPreNavCrawlingContext
+    from crawlee.crawlers import BasicCrawlingContext, PlaywrightCrawlingContext, PlaywrightPreNavCrawlingContext


 @pytest.mark.parametrize(
@@ -671,6 +671,39 @@ async def request_handler(context: PlaywrightCrawlingContext) -> None:
     }


+async def test_respect_robots_txt_with_problematic_links(server_url: URL) -> None:
+    """Test the crawler behavior with links that may cause problems when retrieving robots.txt."""
+    visit = mock.Mock()
+    fail = mock.Mock()
+    crawler = PlaywrightCrawler(
+        respect_robots_txt_file=True,
+        max_request_retries=0,
+    )
+
+    @crawler.router.default_handler
+    async def request_handler(context: PlaywrightCrawlingContext) -> None:
+        visit(context.request.url)
+        await context.enqueue_links(strategy='all')
+
+    @crawler.failed_request_handler
+    async def error_handler(context: BasicCrawlingContext, _error: Exception) -> None:
+        fail(context.request.url)
+
+    await crawler.run([str(server_url / 'problematic_links')])
+
+    visited = {call[0][0] for call in visit.call_args_list}
+    failed = {call[0][0] for call in fail.call_args_list}
+
+    # The mailto link must be skipped.
+    # https://avatars.githubusercontent.com/apify has no retrievable robots.txt, but is a valid URL for the crawler.
+    assert visited == {str(server_url / 'problematic_links'), 'https://avatars.githubusercontent.com/apify'}
+
+    # budplaceholder.com does not exist, so the request must fail.
+    assert failed == {
+        'https://budplaceholder.com/',
+    }
+
+
 async def test_on_skipped_request(server_url: URL) -> None:
     crawler = PlaywrightCrawler(respect_robots_txt_file=True)
     skip = mock.Mock()
diff --git a/tests/unit/server.py b/tests/unit/server.py
index 16af127b1e..320e781ef1 100644
--- a/tests/unit/server.py
+++ b/tests/unit/server.py
@@ -18,6 +18,7 @@
     GENERIC_RESPONSE,
     HELLO_WORLD,
     INCAPSULA,
+    PROBLEMATIC_LINKS,
     ROBOTS_TXT,
     SECONDARY_INDEX,
     START_ENQUEUE,
@@ -102,6 +103,7 @@ async def app(scope: dict[str, Any], receive: Receive, send: Send) -> None:
         'page_1': generic_response_endpoint,
         'page_2': generic_response_endpoint,
         'page_3': generic_response_endpoint,
+        'problematic_links': problematic_links_endpoint,
         'set_cookies': set_cookies,
         'set_complex_cookies': set_complex_cookies,
         'cookies': get_cookies,
@@ -287,6 +289,14 @@ async def generic_response_endpoint(_scope: dict[str, Any], _receive: Receive, s
     )


+async def problematic_links_endpoint(_scope: dict[str, Any], _receive: Receive, send: Send) -> None:
+    """Handle requests with a page containing problematic links."""
+    await send_html_response(
+        send,
+        PROBLEMATIC_LINKS,
+    )
+
+
 async def redirect_to_url(scope: dict[str, Any], _receive: Receive, send: Send) -> None:
     """Handle requests that should redirect to a specified full URL."""
     query_params = get_query_params(scope.get('query_string', b''))
diff --git a/tests/unit/server_endpoints.py b/tests/unit/server_endpoints.py
index 6070e046d0..a42062d114 100644
--- a/tests/unit/server_endpoints.py
+++ b/tests/unit/server_endpoints.py
@@ -35,6 +35,16 @@
 </body></html>"""

+PROBLEMATIC_LINKS = b"""\
+<html><head>
+    <title>Hello</title>
+</head>
+<body>
+    <a href="https://budplaceholder.com/">Placeholder</a>
+    <a href="mailto:test@test.com">test@test.com</a>
+    <a href="https://avatars.githubusercontent.com/apify">Apify avatar</a>
+</body></html>"""
+
 GENERIC_RESPONSE = b"""\
 <html><head>
     <title>Hello</title>