22 changes: 17 additions & 5 deletions src/crawlee/_utils/robots.py
@@ -1,5 +1,6 @@
from __future__ import annotations

+from logging import getLogger
from typing import TYPE_CHECKING

from protego import Protego
@@ -15,6 +16,9 @@
from crawlee.proxy_configuration import ProxyInfo


+logger = getLogger(__name__)


class RobotsTxtFile:
    def __init__(
        self, url: str, robots: Protego, http_client: HttpClient | None = None, proxy_info: ProxyInfo | None = None
@@ -56,12 +60,20 @@ async def load(cls, url: str, http_client: HttpClient, proxy_info: ProxyInfo | N
            http_client: The `HttpClient` instance used to perform the network request for fetching the robots.txt file.
            proxy_info: Optional `ProxyInfo` to be used when fetching the robots.txt file. If None, no proxy is used.
        """
-        response = await http_client.send_request(url, proxy_info=proxy_info)
-        body = (
-            b'User-agent: *\nAllow: /' if is_status_code_client_error(response.status_code) else await response.read()
-        )
+        try:
+            response = await http_client.send_request(url, proxy_info=proxy_info)

+            body = (
+                b'User-agent: *\nAllow: /'
+                if is_status_code_client_error(response.status_code)
+                else await response.read()
+            )
+            robots = Protego.parse(body.decode('utf-8'))

+        except Exception as e:
+            logger.warning(f'Failed to fetch robots.txt from "{url}" with error: "{e}"')
-        robots = Protego.parse(body.decode('utf-8'))
+            robots = Protego.parse('User-agent: *\nAllow: /')

        return cls(url, robots, http_client=http_client, proxy_info=proxy_info)

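The net effect of the new `try`/`except` in `RobotsTxtFile.load` is that any failure while fetching or reading robots.txt (for example an unresolvable host or a timeout) now degrades to an allow-all policy instead of raising. A minimal sketch of what that fallback means in practice, using Protego directly; the example URLs and user-agent string are placeholders, not taken from the PR:

```python
from protego import Protego

# The same fallback rules the except branch parses when robots.txt cannot be fetched.
allow_all = Protego.parse('User-agent: *\nAllow: /')

# Under the allow-all policy every URL is fetchable for every user agent,
# so the crawler proceeds as if no robots.txt restrictions exist.
assert allow_all.can_fetch('https://example.com/any/path', 'my-crawler')
assert allow_all.can_fetch('https://example.com/restricted-elsewhere', '*')
```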
36 changes: 35 additions & 1 deletion tests/unit/crawlers/_beautifulsoup/test_beautifulsoup_crawler.py
@@ -6,7 +6,7 @@
import pytest

from crawlee import ConcurrencySettings, Glob, HttpHeaders, RequestTransformAction, SkippedReason
-from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext
+from crawlee.crawlers import BasicCrawlingContext, BeautifulSoupCrawler, BeautifulSoupCrawlingContext
from crawlee.storages import RequestQueue

if TYPE_CHECKING:
@@ -167,6 +167,40 @@ async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
    }


+async def test_respect_robots_txt_with_problematic_links(server_url: URL, http_client: HttpClient) -> None:
+    """Test the crawler behavior with links that may cause problems when attempting to retrieve robots.txt."""
+    visit = mock.Mock()
+    fail = mock.Mock()
+    crawler = BeautifulSoupCrawler(
+        http_client=http_client,
+        respect_robots_txt_file=True,
+        max_request_retries=0,
+    )

+    @crawler.router.default_handler
+    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
+        visit(context.request.url)
+        await context.enqueue_links(strategy='all')

+    @crawler.failed_request_handler
+    async def error_handler(context: BasicCrawlingContext, _error: Exception) -> None:
+        fail(context.request.url)

+    await crawler.run([str(server_url / 'problematic_links')])

+    visited = {call[0][0] for call in visit.call_args_list}
+    failed = {call[0][0] for call in fail.call_args_list}

+    # The mailto: link must be skipped.
+    # robots.txt cannot be retrieved for https://avatars.githubusercontent.com/apify, but the link itself is valid for the crawler.
+    assert visited == {str(server_url / 'problematic_links'), 'https://avatars.githubusercontent.com/apify'}

+    # budplaceholder.com does not exist, so the request fails.
+    assert failed == {
+        'https://budplaceholder.com/',
+    }


async def test_on_skipped_request(server_url: URL, http_client: HttpClient) -> None:
    crawler = BeautifulSoupCrawler(http_client=http_client, respect_robots_txt_file=True)
    skip = mock.Mock()
36 changes: 35 additions & 1 deletion tests/unit/crawlers/_parsel/test_parsel_crawler.py
@@ -14,7 +14,7 @@
from yarl import URL

from crawlee._request import RequestOptions
-from crawlee.crawlers import ParselCrawlingContext
+from crawlee.crawlers import BasicCrawlingContext, ParselCrawlingContext
from crawlee.http_clients._base import HttpClient


@@ -261,6 +261,40 @@ async def request_handler(context: ParselCrawlingContext) -> None:
    }


+async def test_respect_robots_txt_with_problematic_links(server_url: URL, http_client: HttpClient) -> None:
+    """Test the crawler behavior with links that may cause problems when attempting to retrieve robots.txt."""
+    visit = mock.Mock()
+    fail = mock.Mock()
+    crawler = ParselCrawler(
+        http_client=http_client,
+        respect_robots_txt_file=True,
+        max_request_retries=0,
+    )

+    @crawler.router.default_handler
+    async def request_handler(context: ParselCrawlingContext) -> None:
+        visit(context.request.url)
+        await context.enqueue_links(strategy='all')

+    @crawler.failed_request_handler
+    async def error_handler(context: BasicCrawlingContext, _error: Exception) -> None:
+        fail(context.request.url)

+    await crawler.run([str(server_url / 'problematic_links')])

+    visited = {call[0][0] for call in visit.call_args_list}
+    failed = {call[0][0] for call in fail.call_args_list}

+    # The mailto: link must be skipped.
+    # robots.txt cannot be retrieved for https://avatars.githubusercontent.com/apify, but the link itself is valid for the crawler.
+    assert visited == {str(server_url / 'problematic_links'), 'https://avatars.githubusercontent.com/apify'}

+    # budplaceholder.com does not exist, so the request fails.
+    assert failed == {
+        'https://budplaceholder.com/',
+    }


async def test_on_skipped_request(server_url: URL, http_client: HttpClient) -> None:
    crawler = ParselCrawler(http_client=http_client, respect_robots_txt_file=True)
    skip = mock.Mock()
35 changes: 34 additions & 1 deletion tests/unit/crawlers/_playwright/test_playwright_crawler.py
@@ -48,7 +48,7 @@
from crawlee._request import RequestOptions
from crawlee._types import HttpMethod, HttpPayload
from crawlee.browsers._types import BrowserType
-from crawlee.crawlers import PlaywrightCrawlingContext, PlaywrightPreNavCrawlingContext
+from crawlee.crawlers import BasicCrawlingContext, PlaywrightCrawlingContext, PlaywrightPreNavCrawlingContext


@pytest.mark.parametrize(
@@ -671,6 +671,39 @@ async def request_handler(context: PlaywrightCrawlingContext) -> None:
    }


+async def test_respect_robots_txt_with_problematic_links(server_url: URL) -> None:
+    """Test the crawler behavior with links that may cause problems when attempting to retrieve robots.txt."""
+    visit = mock.Mock()
+    fail = mock.Mock()
+    crawler = PlaywrightCrawler(
+        respect_robots_txt_file=True,
+        max_request_retries=0,
+    )

+    @crawler.router.default_handler
+    async def request_handler(context: PlaywrightCrawlingContext) -> None:
+        visit(context.request.url)
+        await context.enqueue_links(strategy='all')

+    @crawler.failed_request_handler
+    async def error_handler(context: BasicCrawlingContext, _error: Exception) -> None:
+        fail(context.request.url)

+    await crawler.run([str(server_url / 'problematic_links')])

+    visited = {call[0][0] for call in visit.call_args_list}
+    failed = {call[0][0] for call in fail.call_args_list}

+    # The mailto: link must be skipped.
+    # robots.txt cannot be retrieved for https://avatars.githubusercontent.com/apify, but the link itself is valid for the crawler.
+    assert visited == {str(server_url / 'problematic_links'), 'https://avatars.githubusercontent.com/apify'}

+    # budplaceholder.com does not exist, so the request fails.
+    assert failed == {
+        'https://budplaceholder.com/',
+    }


async def test_on_skipped_request(server_url: URL) -> None:
    crawler = PlaywrightCrawler(respect_robots_txt_file=True)
    skip = mock.Mock()
10 changes: 10 additions & 0 deletions tests/unit/server.py
@@ -18,6 +18,7 @@
    GENERIC_RESPONSE,
    HELLO_WORLD,
    INCAPSULA,
+    PROBLEMATIC_LINKS,
    ROBOTS_TXT,
    SECONDARY_INDEX,
    START_ENQUEUE,
@@ -102,6 +103,7 @@ async def app(scope: dict[str, Any], receive: Receive, send: Send) -> None:
        'page_1': generic_response_endpoint,
        'page_2': generic_response_endpoint,
        'page_3': generic_response_endpoint,
+        'problematic_links': problematic_links_endpoint,
        'set_cookies': set_cookies,
        'set_complex_cookies': set_complex_cookies,
        'cookies': get_cookies,
@@ -287,6 +289,14 @@ async def generic_response_endpoint(_scope: dict[str, Any], _receive: Receive, s
    )


+async def problematic_links_endpoint(_scope: dict[str, Any], _receive: Receive, send: Send) -> None:
+    """Handle requests with a page containing problematic links."""
+    await send_html_response(
+        send,
+        PROBLEMATIC_LINKS,
+    )


async def redirect_to_url(scope: dict[str, Any], _receive: Receive, send: Send) -> None:
"""Handle requests that should redirect to a specified full URL."""
query_params = get_query_params(scope.get('query_string', b''))
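The `send_html_response` helper called by the new endpoint is defined elsewhere in the test server and is not part of this diff. For readers unfamiliar with the pattern, a minimal sketch of what such a helper typically looks like under the ASGI protocol is shown below; this is an illustrative assumption, not the repository's actual implementation, and the `Send` alias is a local stand-in for the imported type:

```python
from collections.abc import Awaitable, Callable
from typing import Any

# Local stand-in for the ASGI send callable type used by the test server.
Send = Callable[[dict[str, Any]], Awaitable[None]]


async def send_html_response(send: Send, body: bytes, status: int = 200) -> None:
    """Send a complete HTML response over an ASGI `send` callable."""
    await send({
        'type': 'http.response.start',
        'status': status,
        'headers': [(b'content-type', b'text/html; charset=utf-8')],
    })
    await send({'type': 'http.response.body', 'body': body})
```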
10 changes: 10 additions & 0 deletions tests/unit/server_endpoints.py
@@ -35,6 +35,16 @@
</iframe>
</body></html>"""

+PROBLEMATIC_LINKS = b"""\
+<html><head>
+<title>Hello</title>
+</head>
+<body>
+<a href="https://budplaceholder.com/">Placeholder</a>
+<a href="mailto:test@test.com">test@test.com</a>
+<a href=https://avatars.githubusercontent.com/apify>Apify avatar</a>
+</body></html>"""

GENERIC_RESPONSE = b"""\
<html><head>
<title>Hello</title>