diff --git a/docs/examples/code_examples/respect_robots_txt_file.py b/docs/examples/code_examples/respect_robots_txt_file.py new file mode 100644 index 0000000000..ebd63b1c2e --- /dev/null +++ b/docs/examples/code_examples/respect_robots_txt_file.py @@ -0,0 +1,27 @@ +import asyncio + +from crawlee.crawlers import ( + BeautifulSoupCrawler, + BeautifulSoupCrawlingContext, +) + + +async def main() -> None: + # Initialize the crawler with robots.txt compliance enabled + crawler = BeautifulSoupCrawler(respect_robots_txt_file=True) + + @crawler.router.default_handler + async def request_handler(context: BeautifulSoupCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url} ...') + + # Start the crawler with the specified URLs + # The crawler will check the robots.txt file before making requests + # In this example, 'https://news.ycombinator.com/login' will be skipped + # because it's disallowed in the site's robots.txt file + await crawler.run( + ['https://news.ycombinator.com/', 'https://news.ycombinator.com/login'] + ) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/docs/examples/respect_robots_txt_file.mdx b/docs/examples/respect_robots_txt_file.mdx new file mode 100644 index 0000000000..5f6194c919 --- /dev/null +++ b/docs/examples/respect_robots_txt_file.mdx @@ -0,0 +1,21 @@ +--- +id: respect-robots-txt-file +title: Respect robots.txt file +--- + +import ApiLink from '@site/src/components/ApiLink'; +import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; + +import RespectRobotsTxt from '!!raw-loader!roa-loader!./code_examples/respect_robots_txt_file.py'; + +This example demonstrates how to configure your crawler to respect the rules established by websites for crawlers as described in the [robots.txt](https://www.robotstxt.org/robotstxt.html) file. + +To configure `Crawlee` to follow the `robots.txt` file, set the parameter `respect_robots_txt_file=True` in <ApiLink to="class/BasicCrawlerOptions">`BasicCrawlerOptions`</ApiLink>. In this case, `Crawlee` will skip any URLs forbidden in the website's robots.txt file. + +As an example, let's look at the website `https://news.ycombinator.com/` and its corresponding [robots.txt](https://news.ycombinator.com/robots.txt) file. Since the file has a rule `Disallow: /login`, the URL `https://news.ycombinator.com/login` will be automatically skipped. + +The code below demonstrates this behavior using the <ApiLink to="class/BeautifulSoupCrawler">`BeautifulSoupCrawler`</ApiLink>: + +<RunnableCodeBlock className="language-python" language="python"> + {RespectRobotsTxt} +</RunnableCodeBlock> diff --git a/pyproject.toml b/pyproject.toml index 0c6c4596c0..77b1fe2d1b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -40,6 +40,7 @@ dependencies = [ "eval-type-backport>=0.2.0", "httpx[brotli,http2,zstd]>=0.27.0", "more-itertools>=10.2.0", + "protego>=0.4.0", "psutil>=6.0.0", "pydantic-settings>=2.2.0,<2.7.0", "pydantic>=2.8.0,!=2.10.0,!=2.10.1,!=2.10.2", @@ -236,7 +237,9 @@ module = [ "functions_framework", # Example code shows deploy on Google Cloud. "jaro", # Untyped and stubs not available "loguru", # Example code shows integration of loguru and crawlee for JSON logging. 
+ "protego", # Untyped and stubs not available "sklearn.linear_model", # Untyped and stubs not available + "sortedcollections", # Untyped and stubs not available "cookiecutter.*", # Untyped and stubs not available "inquirer.*", # Untyped and stubs not available ] diff --git a/src/crawlee/_utils/robots.py b/src/crawlee/_utils/robots.py new file mode 100644 index 0000000000..930ae09431 --- /dev/null +++ b/src/crawlee/_utils/robots.py @@ -0,0 +1,85 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + +from protego import Protego +from yarl import URL + +from crawlee._utils.web import is_status_code_client_error + +if TYPE_CHECKING: + from typing_extensions import Self + + from crawlee.http_clients import HttpClient + from crawlee.proxy_configuration import ProxyInfo + + +class RobotsTxtFile: + def __init__(self, url: str, robots: Protego) -> None: + self._robots = robots + self._original_url = URL(url).origin() + + @classmethod + async def from_content(cls, url: str, content: str) -> Self: + """Create a `RobotsTxtFile` instance from the given content. + + Args: + url: The URL associated with the robots.txt file. + content: The raw string content of the robots.txt file to be parsed. + """ + robots = Protego.parse(content) + return cls(url, robots) + + @classmethod + async def find(cls, url: str, http_client: HttpClient, proxy_info: ProxyInfo | None = None) -> Self: + """Determine the location of a robots.txt file for a URL and fetch it. + + Args: + url: The URL whose domain will be used to find the corresponding robots.txt file. + http_client: Optional `ProxyInfo` to be used when fetching the robots.txt file. If None, no proxy is used. + proxy_info: The `HttpClient` instance used to perform the network request for fetching the robots.txt file. + """ + robots_url = URL(url).with_path('/robots.txt') + return await cls.load(str(robots_url), http_client, proxy_info) + + @classmethod + async def load(cls, url: str, http_client: HttpClient, proxy_info: ProxyInfo | None = None) -> Self: + """Load the robots.txt file for a given URL. + + Args: + url: The direct URL of the robots.txt file to be loaded. + http_client: The `HttpClient` instance used to perform the network request for fetching the robots.txt file. + proxy_info: Optional `ProxyInfo` to be used when fetching the robots.txt file. If None, no proxy is used. + """ + response = await http_client.send_request(url, proxy_info=proxy_info) + body = b'User-agent: *\nAllow: /' if is_status_code_client_error(response.status_code) else response.read() + + robots = Protego.parse(body.decode('utf-8')) + + return cls(url, robots) + + def is_allowed(self, url: str, user_agent: str = '*') -> bool: + """Check if the given URL is allowed for the given user agent. + + Args: + url: The URL to check against the robots.txt rules. + user_agent: The user-agent string to check permissions for. Defaults to '*' which matches any user-agent. + """ + check_url = URL(url) + if check_url.origin() != self._original_url: + return True + return bool(self._robots.can_fetch(str(check_url), user_agent)) + + def get_sitemaps(self) -> list[str]: + """Get the list of sitemaps urls from the robots.txt file.""" + return list(self._robots.sitemaps) + + def get_crawl_delay(self, user_agent: str = '*') -> int | None: + """Get the crawl delay for the given user agent. + + Args: + user_agent: The user-agent string to check the crawl delay for. Defaults to '*' which matches any + user-agent. 
+ """ + crawl_delay = self._robots.crawl_delay(user_agent) + return int(crawl_delay) if crawl_delay is not None else None diff --git a/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py b/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py index 04e16683f6..9abcb4c6f5 100644 --- a/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py +++ b/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py @@ -159,12 +159,19 @@ async def extract_links( requests = list[Request]() base_user_data = user_data or {} + robots_txt_file = await self._get_robots_txt_file_for_url(context.request.url) + for link in self._parser.find_links(parsed_content, selector=selector): url = link if not is_url_absolute(url): base_url = context.request.loaded_url or context.request.url url = convert_to_absolute_url(base_url, url) + if robots_txt_file and not robots_txt_file.is_allowed(url): + # TODO: https://github.com/apify/crawlee-python/issues/1160 + # add processing with on_skipped_request hook + continue + request_options = RequestOptions(url=url, user_data={**base_user_data}, label=label) if transform_request_function: diff --git a/src/crawlee/crawlers/_basic/_basic_crawler.py b/src/crawlee/crawlers/_basic/_basic_crawler.py index c69db280a6..49b28c043e 100644 --- a/src/crawlee/crawlers/_basic/_basic_crawler.py +++ b/src/crawlee/crawlers/_basic/_basic_crawler.py @@ -17,8 +17,10 @@ from urllib.parse import ParseResult, urlparse from weakref import WeakKeyDictionary +from cachetools import LRUCache from tldextract import TLDExtract from typing_extensions import NotRequired, TypedDict, TypeVar, Unpack, assert_never +from yarl import URL from crawlee import EnqueueStrategy, Glob, service_locator from crawlee._autoscaling import AutoscaledPool, Snapshotter, SystemStatus @@ -32,6 +34,7 @@ SendRequestFunction, ) from crawlee._utils.docs import docs_group +from crawlee._utils.robots import RobotsTxtFile from crawlee._utils.urls import convert_to_absolute_url, is_url_absolute from crawlee._utils.wait import wait_for from crawlee._utils.web import is_status_code_client_error, is_status_code_server_error @@ -158,6 +161,10 @@ class _BasicCrawlerOptions(TypedDict): """A logger instance, typically provided by a subclass, for consistent logging labels. Intended for use by subclasses rather than direct instantiation of `BasicCrawler`.""" + respect_robots_txt_file: NotRequired[bool] + """If set to `True`, the crawler will automatically try to fetch the robots.txt file for each domain, + and skip those that are not allowed. This also prevents disallowed URLs to be added via `EnqueueLinksFunction`.""" + class _BasicCrawlerOptionsGeneric(Generic[TCrawlingContext, TStatisticsState], TypedDict): """Generic options the `BasicCrawler` constructor.""" @@ -238,6 +245,7 @@ def __init__( keep_alive: bool = False, configure_logging: bool = True, statistics_log_format: Literal['table', 'inline'] = 'table', + respect_robots_txt_file: bool = False, _context_pipeline: ContextPipeline[TCrawlingContext] | None = None, _additional_context_managers: Sequence[AbstractAsyncContextManager] | None = None, _logger: logging.Logger | None = None, @@ -280,6 +288,9 @@ def __init__( configure_logging: If True, the crawler will set up logging infrastructure automatically. statistics_log_format: If 'table', displays crawler statistics as formatted tables in logs. If 'inline', outputs statistics as plain text log messages. 
+            respect_robots_txt_file: If set to `True`, the crawler will automatically try to fetch the robots.txt file
+                for each domain, and skip URLs that are not allowed. This also prevents disallowed URLs from being
+                added via `EnqueueLinksFunction`.
             _context_pipeline: Enables extending the request lifecycle and modifying the crawling context.
                 Intended for use by subclasses rather than direct instantiation of `BasicCrawler`.
             _additional_context_managers: Additional context managers used throughout the crawler lifecycle.
@@ -335,6 +346,7 @@ def __init__(
         self._max_requests_per_crawl = max_requests_per_crawl
         self._max_session_rotations = max_session_rotations
         self._max_crawl_depth = max_crawl_depth
+        self._respect_robots_txt_file = respect_robots_txt_file
 
         # Timeouts
         self._request_handler_timeout = request_handler_timeout
@@ -371,6 +383,8 @@ def __init__(
         self._additional_context_managers = _additional_context_managers or []
 
         # Internal, not explicitly configurable components
+        self._robots_txt_file_cache: LRUCache[str, RobotsTxtFile] = LRUCache(maxsize=1000)
+        self._robots_txt_lock = asyncio.Lock()
         self._tld_extractor = TLDExtract(cache_dir=tempfile.TemporaryDirectory().name)
         self._snapshotter = Snapshotter.from_config(config)
         self._autoscaled_pool = AutoscaledPool(
@@ -645,10 +659,25 @@ async def add_requests(
             wait_for_all_requests_to_be_added: If True, wait for all requests to be added before returning.
             wait_for_all_requests_to_be_added_timeout: Timeout for waiting for all requests to be added.
         """
+        allowed_requests = []
+        skipped = []
+
+        for request in requests:
+            check_url = request.url if isinstance(request, Request) else request
+            if await self._is_allowed_based_on_robots_txt_file(check_url):
+                allowed_requests.append(request)
+            else:
+                skipped.append(request)
+
+        if skipped:
+            # TODO: https://github.com/apify/crawlee-python/issues/1160
+            # add processing with on_skipped_request hook
+            self._logger.warning('Some requests were skipped because they were disallowed based on the robots.txt file')
+
         request_manager = await self.get_request_manager()
         await request_manager.add_requests_batched(
-            requests=requests,
+            requests=allowed_requests,
             batch_size=batch_size,
             wait_time_between_batches=wait_time_between_batches,
             wait_for_all_requests_to_be_added=wait_for_all_requests_to_be_added,
@@ -1080,6 +1109,22 @@ async def __run_task_function(self) -> None:
         if request is None:
             return
 
+        if not (await self._is_allowed_based_on_robots_txt_file(request.url)):
+            self._logger.warning(
+                f'Skipping request {request.url} ({request.id}) because it is disallowed based on robots.txt'
+            )
+            await wait_for(
+                lambda: request_manager.mark_request_as_handled(request),
+                timeout=self._internal_timeout,
+                timeout_message='Marking request as handled timed out after '
+                f'{self._internal_timeout.total_seconds()} seconds',
+                logger=self._logger,
+                max_retries=3,
+            )
+            # TODO: https://github.com/apify/crawlee-python/issues/1160
+            # add processing with on_skipped_request hook
+            return
+
         if request.session_id:
             session = await self._get_session_by_id(request.session_id)
         else:
@@ -1263,3 +1308,38 @@ def _check_request_collision(self, request: Request, session: Session | None) ->
             raise RequestCollisionError(
                 f'The Session (id: {request.session_id}) bound to the Request is no longer available in SessionPool'
             )
+
+    async def _is_allowed_based_on_robots_txt_file(self, url: str) -> bool:
+        """Check if the URL is allowed based on the robots.txt file.
+
+        Args:
+            url: The URL to verify against robots.txt rules.
Returns True if crawling this URL is permitted. + """ + if not self._respect_robots_txt_file: + return True + robots_txt_file = await self._get_robots_txt_file_for_url(url) + return not robots_txt_file or robots_txt_file.is_allowed(url) + + async def _get_robots_txt_file_for_url(self, url: str) -> RobotsTxtFile | None: + """Get the RobotsTxtFile for a given URL. + + Args: + url: The URL whose domain will be used to locate and fetch the corresponding robots.txt file. + """ + if not self._respect_robots_txt_file: + return None + origin_url = str(URL(url).origin()) + robots_txt_file = self._robots_txt_file_cache.get(origin_url) + if robots_txt_file: + return robots_txt_file + + async with self._robots_txt_lock: + # Check again if the robots.txt file is already cached after acquiring the lock + robots_txt_file = self._robots_txt_file_cache.get(origin_url) + if robots_txt_file: + return robots_txt_file + + # If not cached, fetch the robots.txt file + robots_txt_file = await RobotsTxtFile.find(url, self._http_client) + self._robots_txt_file_cache[origin_url] = robots_txt_file + return robots_txt_file diff --git a/src/crawlee/crawlers/_playwright/_playwright_crawler.py b/src/crawlee/crawlers/_playwright/_playwright_crawler.py index f923229087..8981498906 100644 --- a/src/crawlee/crawlers/_playwright/_playwright_crawler.py +++ b/src/crawlee/crawlers/_playwright/_playwright_crawler.py @@ -290,6 +290,8 @@ async def extract_links( elements = await context.page.query_selector_all(selector) + robots_txt_file = await self._get_robots_txt_file_for_url(context.request.url) + for element in elements: url = await element.get_attribute('href') @@ -300,6 +302,11 @@ async def extract_links( base_url = context.request.loaded_url or context.request.url url = convert_to_absolute_url(base_url, url) + if robots_txt_file and not robots_txt_file.is_allowed(url): + # TODO: https://github.com/apify/crawlee-python/issues/1160 + # add processing with on_skipped_request hook + continue + request_option = RequestOptions({'url': url, 'user_data': {**base_user_data}, 'label': label}) if transform_request_function: diff --git a/src/crawlee/storage_clients/_memory/_request_queue_client.py b/src/crawlee/storage_clients/_memory/_request_queue_client.py index 0031e54abd..477d53df07 100644 --- a/src/crawlee/storage_clients/_memory/_request_queue_client.py +++ b/src/crawlee/storage_clients/_memory/_request_queue_client.py @@ -8,7 +8,7 @@ from logging import getLogger from typing import TYPE_CHECKING -from sortedcollections import ValueSortedDict # type: ignore[import-untyped] +from sortedcollections import ValueSortedDict from typing_extensions import override from crawlee._types import StorageTypes diff --git a/tests/unit/_utils/test_robots.py b/tests/unit/_utils/test_robots.py new file mode 100644 index 0000000000..61dc60daa5 --- /dev/null +++ b/tests/unit/_utils/test_robots.py @@ -0,0 +1,49 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + +from crawlee._utils.robots import RobotsTxtFile + +if TYPE_CHECKING: + from yarl import URL + + from crawlee.http_clients._base import HttpClient + + +async def test_generation_robots_txt_url(server_url: URL, http_client: HttpClient) -> None: + robots_file = await RobotsTxtFile.find(str(server_url), http_client) + assert len(robots_file.get_sitemaps()) > 0 + + +async def test_allow_disallow_robots_txt(server_url: URL, http_client: HttpClient) -> None: + robots = await RobotsTxtFile.find(str(server_url), http_client) + assert robots.is_allowed('https://crawlee.dev') 
+ assert robots.is_allowed(str(server_url / 'something/page.html')) + assert robots.is_allowed(str(server_url / 'deny_googlebot/page.html')) + assert not robots.is_allowed(str(server_url / 'deny_all/page.html')) + + +async def test_extract_sitemaps_urls(server_url: URL, http_client: HttpClient) -> None: + robots = await RobotsTxtFile.find(str(server_url), http_client) + assert len(robots.get_sitemaps()) == 2 + assert set(robots.get_sitemaps()) == {'http://not-exists.com/sitemap_1.xml', 'http://not-exists.com/sitemap_2.xml'} + + +async def test_parse_from_content() -> None: + content = """User-agent: * + Disallow: *deny_all/ + crawl-delay: 10 + User-agent: Googlebot + Disallow: *deny_googlebot/""" + robots = await RobotsTxtFile.from_content('http://not-exists.com/robots.txt', content) + assert robots.is_allowed('http://not-exists.com/something/page.html') + assert robots.is_allowed('http://not-exists.com/deny_googlebot/page.html') + assert not robots.is_allowed('http://not-exists.com/deny_googlebot/page.html', 'Googlebot') + assert not robots.is_allowed('http://not-exists.com/deny_all/page.html') + + +async def test_bind_robots_txt_url() -> None: + content = 'User-agent: *\nDisallow: /' + robots = await RobotsTxtFile.from_content('http://check.com/robots.txt', content) + assert not robots.is_allowed('http://check.com/test.html') + assert robots.is_allowed('http://othercheck.com/robots.txt') diff --git a/tests/unit/crawlers/_basic/test_basic_crawler.py b/tests/unit/crawlers/_basic/test_basic_crawler.py index ab7a219ef7..40f57de5ea 100644 --- a/tests/unit/crawlers/_basic/test_basic_crawler.py +++ b/tests/unit/crawlers/_basic/test_basic_crawler.py @@ -12,13 +12,14 @@ from datetime import timedelta from pathlib import Path from typing import TYPE_CHECKING, Any, Literal, cast -from unittest.mock import AsyncMock, Mock, call +from unittest.mock import AsyncMock, Mock, call, patch import pytest from crawlee import ConcurrencySettings, Glob, service_locator from crawlee._request import Request from crawlee._types import BasicCrawlingContext, EnqueueLinksKwargs, HttpHeaders +from crawlee._utils.robots import RobotsTxtFile from crawlee.configuration import Configuration from crawlee.crawlers import BasicCrawler from crawlee.errors import RequestCollisionError, SessionError, UserDefinedErrorHandlerError @@ -1310,3 +1311,15 @@ async def failed_request_handler(context: BasicCrawlingContext, error: Exception await crawler.run(requests) assert set(requests) == handler_requests + + +async def test_lock_with_get_robots_txt_file_for_url(server_url: URL) -> None: + crawler = BasicCrawler(respect_robots_txt_file=True) + + with patch('crawlee.crawlers._basic._basic_crawler.RobotsTxtFile.find', wraps=RobotsTxtFile.find) as spy: + await asyncio.gather( + *[asyncio.create_task(crawler._get_robots_txt_file_for_url(str(server_url))) for _ in range(10)] + ) + + # Check that the lock was acquired only once + assert spy.call_count == 1 diff --git a/tests/unit/crawlers/_beautifulsoup/test_beautifulsoup_crawler.py b/tests/unit/crawlers/_beautifulsoup/test_beautifulsoup_crawler.py index 167391dc6f..b73ea4aeaa 100644 --- a/tests/unit/crawlers/_beautifulsoup/test_beautifulsoup_crawler.py +++ b/tests/unit/crawlers/_beautifulsoup/test_beautifulsoup_crawler.py @@ -142,3 +142,21 @@ async def test_handle_blocked_request(server_url: URL, http_client: HttpClient) def test_default_logger() -> None: assert BeautifulSoupCrawler().log.name == 'BeautifulSoupCrawler' + + +async def test_respect_robots_txt(server_url: URL, http_client: 
HttpClient) -> None: + crawler = BeautifulSoupCrawler(http_client=http_client, respect_robots_txt_file=True) + visit = mock.Mock() + + @crawler.router.default_handler + async def request_handler(context: BeautifulSoupCrawlingContext) -> None: + visit(context.request.url) + await context.enqueue_links() + + await crawler.run([str(server_url / 'start_enqueue')]) + visited = {call[0][0] for call in visit.call_args_list} + + assert visited == { + str(server_url / 'start_enqueue'), + str(server_url / 'sub_index'), + } diff --git a/tests/unit/crawlers/_parsel/test_parsel_crawler.py b/tests/unit/crawlers/_parsel/test_parsel_crawler.py index 477c091050..586962eac7 100644 --- a/tests/unit/crawlers/_parsel/test_parsel_crawler.py +++ b/tests/unit/crawlers/_parsel/test_parsel_crawler.py @@ -239,3 +239,21 @@ async def request_handler(context: ParselCrawlingContext) -> None: def test_default_logger() -> None: assert ParselCrawler().log.name == 'ParselCrawler' + + +async def test_respect_robots_txt(server_url: URL, http_client: HttpClient) -> None: + crawler = ParselCrawler(http_client=http_client, respect_robots_txt_file=True) + visit = mock.Mock() + + @crawler.router.default_handler + async def request_handler(context: ParselCrawlingContext) -> None: + visit(context.request.url) + await context.enqueue_links() + + await crawler.run([str(server_url / 'start_enqueue')]) + visited = {call[0][0] for call in visit.call_args_list} + + assert visited == { + str(server_url / 'start_enqueue'), + str(server_url / 'sub_index'), + } diff --git a/tests/unit/crawlers/_playwright/test_playwright_crawler.py b/tests/unit/crawlers/_playwright/test_playwright_crawler.py index aaf8fcaad2..3dbca017a6 100644 --- a/tests/unit/crawlers/_playwright/test_playwright_crawler.py +++ b/tests/unit/crawlers/_playwright/test_playwright_crawler.py @@ -598,3 +598,21 @@ async def request_handler(context: PlaywrightCrawlingContext) -> None: assert crawler.statistics.error_tracker.total == 3 * max_retries assert crawler.statistics.error_tracker.unique_error_count == 2 assert len(kvs_content) == 4 + + +async def test_respect_robots_txt(server_url: URL) -> None: + crawler = PlaywrightCrawler(respect_robots_txt_file=True) + visit = mock.Mock() + + @crawler.router.default_handler + async def request_handler(context: PlaywrightCrawlingContext) -> None: + visit(context.request.url) + await context.enqueue_links() + + await crawler.run([str(server_url / 'start_enqueue')]) + visited = {call[0][0] for call in visit.call_args_list} + + assert visited == { + str(server_url / 'start_enqueue'), + str(server_url / 'sub_index'), + } diff --git a/tests/unit/server.py b/tests/unit/server.py index 29e789d013..21ba01cec8 100644 --- a/tests/unit/server.py +++ b/tests/unit/server.py @@ -11,7 +11,14 @@ from uvicorn.server import Server from yarl import URL -from tests.unit.server_endpoints import GENERIC_RESPONSE, HELLO_WORLD, INCAPSULA, SECONDARY_INDEX, START_ENQUEUE +from tests.unit.server_endpoints import ( + GENERIC_RESPONSE, + HELLO_WORLD, + INCAPSULA, + ROBOTS_TXT, + SECONDARY_INDEX, + START_ENQUEUE, +) if TYPE_CHECKING: from socket import socket @@ -120,6 +127,8 @@ async def app(scope: dict[str, Any], receive: Receive, send: Send) -> None: await hello_world_json(send) elif path.startswith('/xml'): await hello_world_xml(send) + elif path.startswith('/robots.txt'): + await robots_txt(send) else: await hello_world(send) @@ -366,6 +375,11 @@ async def dynamic_content(scope: dict[str, Any], send: Send) -> None: await send_html_response(send, 
html_content=content.encode()) +async def robots_txt(send: Send) -> None: + """Handle requests for the robots.txt file.""" + await send_html_response(send, ROBOTS_TXT) + + class TestServer(Server): """A test HTTP server implementation based on Uvicorn Server.""" diff --git a/tests/unit/server_endpoints.py b/tests/unit/server_endpoints.py index 00456d3dcd..a9f48e6e47 100644 --- a/tests/unit/server_endpoints.py +++ b/tests/unit/server_endpoints.py @@ -41,3 +41,20 @@ <body> Insightful content </body></html>""" + + +ROBOTS_TXT = b"""\ +User-agent: * +Disallow: *deny_all/ +Disallow: /page_ +crawl-delay: 10 + +User-agent: Googlebot +Disallow: *deny_googlebot/ +crawl-delay: 1 + +user-agent: Mozilla +crawl-delay: 2 + +sitemap: http://not-exists.com/sitemap_1.xml +sitemap: http://not-exists.com/sitemap_2.xml""" diff --git a/uv.lock b/uv.lock index 392d5b63fa..77035eff2c 100644 --- a/uv.lock +++ b/uv.lock @@ -600,7 +600,7 @@ toml = [ [[package]] name = "crawlee" -version = "0.6.7" +version = "0.6.8" source = { editable = "." } dependencies = [ { name = "apify-fingerprint-datapoints" }, @@ -610,6 +610,7 @@ dependencies = [ { name = "eval-type-backport" }, { name = "httpx", extra = ["brotli", "http2", "zstd"] }, { name = "more-itertools" }, + { name = "protego" }, { name = "psutil" }, { name = "pydantic" }, { name = "pydantic-settings" }, @@ -711,6 +712,7 @@ requires-dist = [ { name = "playwright", marker = "extra == 'adaptive-crawler'", specifier = ">=1.27.0" }, { name = "playwright", marker = "extra == 'all'", specifier = ">=1.27.0" }, { name = "playwright", marker = "extra == 'playwright'", specifier = ">=1.27.0" }, + { name = "protego", specifier = ">=0.4.0" }, { name = "psutil", specifier = ">=6.0.0" }, { name = "pydantic", specifier = ">=2.8.0,!=2.10.0,!=2.10.1,!=2.10.2" }, { name = "pydantic-settings", specifier = ">=2.2.0,<2.7.0" }, @@ -1950,6 +1952,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/b8/d3/c3cb8f1d6ae3b37f83e1de806713a9b3642c5895f0215a62e1a4bd6e5e34/propcache-0.3.1-py3-none-any.whl", hash = "sha256:9a8ecf38de50a7f518c21568c80f985e776397b902f1ce0b01f799aba1608b40", size = 12376, upload_time = "2025-03-26T03:06:10.5Z" }, ] +[[package]] +name = "protego" +version = "0.4.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/4e/6b/84e878d0567dfc11538bad6ce2595cee7ae0c47cf6bf7293683c9ec78ef8/protego-0.4.0.tar.gz", hash = "sha256:93a5e662b61399a0e1f208a324f2c6ea95b23ee39e6cbf2c96246da4a656c2f6", size = 3246425, upload_time = "2025-01-17T15:48:21.644Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d9/fd/8d84d75832b0983cecf3aff7ae48362fe96fc8ab6ebca9dcf3cefd87e79c/Protego-0.4.0-py2.py3-none-any.whl", hash = "sha256:37640bc0ebe37572d624453a21381d05e9d86e44f89ff1e81794d185a0491666", size = 8553, upload_time = "2025-01-17T15:48:18.332Z" }, +] + [[package]] name = "proxy-py" version = "2.4.10"