diff --git a/pyproject.toml b/pyproject.toml
index 0d41756e..bda4c509 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -141,6 +141,9 @@ indent-style = "space"
 docstring-quotes = "double"
 inline-quotes = "single"
 
+[tool.ruff.lint.flake8-type-checking]
+runtime-evaluated-base-classes = ["pydantic.BaseModel", "crawlee.configuration.Configuration"]
+
 [tool.ruff.lint.flake8-builtins]
 builtins-ignorelist = ["id"]
 
diff --git a/src/apify/_actor.py b/src/apify/_actor.py
index 4920c593..5df16eb2 100644
--- a/src/apify/_actor.py
+++ b/src/apify/_actor.py
@@ -8,7 +8,6 @@
 
 from lazy_object_proxy import Proxy
 from pydantic import AliasChoices
-from typing_extensions import Self
 
 from apify_client import ApifyClientAsync
 from apify_shared.consts import ActorEnvVars, ActorExitCodes, ApifyEnvVars
@@ -31,6 +30,8 @@
     import logging
     from types import TracebackType
 
+    from typing_extensions import Self
+
     from crawlee.proxy_configuration import _NewUrlFunction
 
     from apify._models import Webhook
diff --git a/src/apify/_configuration.py b/src/apify/_configuration.py
index 306e9d63..bfe3ec13 100644
--- a/src/apify/_configuration.py
+++ b/src/apify/_configuration.py
@@ -1,4 +1,3 @@
-# ruff: noqa: TCH001 TCH002 TCH003 (so that pydantic annotations work)
 from __future__ import annotations
 
 from datetime import datetime, timedelta
diff --git a/src/apify/_models.py b/src/apify/_models.py
index d4450790..ed66bf2f 100644
--- a/src/apify/_models.py
+++ b/src/apify/_models.py
@@ -1,4 +1,3 @@
-# ruff: noqa: TCH001 TCH002 TCH003 (Pydantic)
 from __future__ import annotations
 
 from datetime import datetime, timedelta
diff --git a/src/apify/_platform_event_manager.py b/src/apify/_platform_event_manager.py
index c0c8b67b..bbc993ab 100644
--- a/src/apify/_platform_event_manager.py
+++ b/src/apify/_platform_event_manager.py
@@ -1,7 +1,7 @@
 from __future__ import annotations
 
 import asyncio
-from datetime import datetime  # noqa: TCH003
+from datetime import datetime
 from typing import TYPE_CHECKING, Annotated, Any, Literal, Union
 
 import websockets.client
diff --git a/src/apify/scrapy/middlewares/apify_proxy.py b/src/apify/scrapy/middlewares/apify_proxy.py
index 3a7f7b75..b1dc2b88 100644
--- a/src/apify/scrapy/middlewares/apify_proxy.py
+++ b/src/apify/scrapy/middlewares/apify_proxy.py
@@ -1,11 +1,13 @@
 from __future__ import annotations
 
+from typing import TYPE_CHECKING
 from urllib.parse import ParseResult, urlparse
 
 try:
-    from scrapy import Request, Spider  # noqa: TCH002
+    if TYPE_CHECKING:
+        from scrapy import Request, Spider
+        from scrapy.crawler import Crawler
     from scrapy.core.downloader.handlers.http11 import TunnelError
-    from scrapy.crawler import Crawler  # noqa: TCH002
     from scrapy.exceptions import NotConfigured
 except ImportError as exc:
     raise ImportError(
diff --git a/src/apify/scrapy/pipelines/actor_dataset_push.py b/src/apify/scrapy/pipelines/actor_dataset_push.py
index 15026475..d2d983cc 100644
--- a/src/apify/scrapy/pipelines/actor_dataset_push.py
+++ b/src/apify/scrapy/pipelines/actor_dataset_push.py
@@ -1,9 +1,12 @@
 from __future__ import annotations
 
+from typing import TYPE_CHECKING
+
 from itemadapter.adapter import ItemAdapter
 
 try:
-    from scrapy import Item, Spider  # noqa: TCH002
+    if TYPE_CHECKING:
+        from scrapy import Item, Spider
 except ImportError as exc:
     raise ImportError(
Run "pip install apify[scrapy]".', diff --git a/src/apify/scrapy/scheduler.py b/src/apify/scrapy/scheduler.py index 849e5376..da79ac64 100644 --- a/src/apify/scrapy/scheduler.py +++ b/src/apify/scrapy/scheduler.py @@ -1,6 +1,7 @@ from __future__ import annotations import traceback +from typing import TYPE_CHECKING from apify._configuration import Configuration from apify.apify_storage_client import ApifyStorageClient @@ -8,8 +9,10 @@ try: from scrapy import Spider from scrapy.core.scheduler import BaseScheduler - from scrapy.http.request import Request # noqa: TCH002 from scrapy.utils.reactor import is_asyncio_reactor_installed + + if TYPE_CHECKING: + from scrapy.http.request import Request except ImportError as exc: raise ImportError( 'To use this module, you need to install the "scrapy" extra. Run "pip install apify[scrapy]".', diff --git a/src/apify/scrapy/utils.py b/src/apify/scrapy/utils.py index f22a60de..1f92d4ff 100644 --- a/src/apify/scrapy/utils.py +++ b/src/apify/scrapy/utils.py @@ -2,14 +2,17 @@ import asyncio from base64 import b64encode +from typing import TYPE_CHECKING from urllib.parse import unquote from apify_shared.utils import ignore_docs try: - from scrapy.settings import Settings # noqa: TCH002 from scrapy.utils.project import get_project_settings from scrapy.utils.python import to_bytes + + if TYPE_CHECKING: + from scrapy.settings import Settings except ImportError as exc: raise ImportError( 'To use this module, you need to install the "scrapy" extra. For example, if you use pip, run ' diff --git a/src/apify/storages/__init__.py b/src/apify/storages/__init__.py index 2ed85e84..63ac7af6 100644 --- a/src/apify/storages/__init__.py +++ b/src/apify/storages/__init__.py @@ -1,3 +1,5 @@ from crawlee.storages import Dataset, KeyValueStore, RequestQueue -__all__ = ['Dataset', 'KeyValueStore', 'RequestQueue'] +from ._request_list import RequestList + +__all__ = ['Dataset', 'KeyValueStore', 'RequestQueue', 'RequestList'] diff --git a/src/apify/storages/_request_list.py b/src/apify/storages/_request_list.py new file mode 100644 index 00000000..2dd381fa --- /dev/null +++ b/src/apify/storages/_request_list.py @@ -0,0 +1,150 @@ +from __future__ import annotations + +import asyncio +import re +from asyncio import Task +from functools import partial +from typing import Annotated, Any, Union + +from pydantic import BaseModel, Field, TypeAdapter + +from crawlee import Request +from crawlee._types import HttpMethod +from crawlee.http_clients import BaseHttpClient, HttpxHttpClient +from crawlee.storages import RequestList as CrawleeRequestList + +from apify._utils import docs_group + +URL_NO_COMMAS_REGEX = re.compile( + r'https?:\/\/(www\.)?([^\W_]|[^\W_][-\w0-9@:%._+~#=]{0,254}[^\W_])\.[a-z]{2,63}(:\d{1,5})?(\/[-\w@:%+.~#?&/=()]*)?' +) + + +class _RequestDetails(BaseModel): + method: HttpMethod = 'GET' + payload: str = '' + headers: Annotated[dict[str, str], Field(default_factory=dict)] = {} + user_data: Annotated[dict[str, str], Field(default_factory=dict, alias='userData')] = {} + + +class _RequestsFromUrlInput(_RequestDetails): + requests_from_url: str = Field(alias='requestsFromUrl') + + +class _SimpleUrlInput(_RequestDetails): + url: str + + +url_input_adapter = TypeAdapter(list[Union[_RequestsFromUrlInput, _SimpleUrlInput]]) + + +@docs_group('Classes') +class RequestList(CrawleeRequestList): + """Extends crawlee RequestList. + + Method open is used to create RequestList from actor's requestListSources input. 
+ """ + + @staticmethod + async def open( + name: str | None = None, + request_list_sources_input: list[dict[str, Any]] | None = None, + http_client: BaseHttpClient | None = None, + ) -> RequestList: + """Creates RequestList from Actor input requestListSources. + + Args: + name: Name of the returned RequestList. + request_list_sources_input: List of dicts with either url key or requestsFromUrl key. + http_client: Client that will be used to send get request to urls defined by value of requestsFromUrl keys. + + Returns: + RequestList created from request_list_sources_input. + + ### Usage + + ```python + example_input = [ + # Gather urls from response body. + {'requestsFromUrl': 'https://crawlee.dev/file.txt', 'method': 'GET'}, + # Directly include this url. + {'url': 'https://crawlee.dev', 'method': 'GET'} + ] + request_list = await RequestList.open(request_list_sources_input=example_input) + ``` + """ + request_list_sources_input = request_list_sources_input or [] + return await RequestList._create_request_list(name, request_list_sources_input, http_client) + + @staticmethod + async def _create_request_list( + name: str | None, request_list_sources_input: list[dict[str, Any]], http_client: BaseHttpClient | None + ) -> RequestList: + if not http_client: + http_client = HttpxHttpClient() + + url_inputs = url_input_adapter.validate_python(request_list_sources_input) + + simple_url_inputs = [url_input for url_input in url_inputs if isinstance(url_input, _SimpleUrlInput)] + remote_url_inputs = [url_input for url_input in url_inputs if isinstance(url_input, _RequestsFromUrlInput)] + + simple_url_requests = RequestList._create_requests_from_input(simple_url_inputs) + remote_url_requests = await RequestList._fetch_requests_from_url(remote_url_inputs, http_client=http_client) + + return RequestList(name=name, requests=simple_url_requests + remote_url_requests) + + @staticmethod + def _create_requests_from_input(simple_url_inputs: list[_SimpleUrlInput]) -> list[Request]: + return [ + Request.from_url( + method=request_input.method, + url=request_input.url, + payload=request_input.payload.encode('utf-8'), + headers=request_input.headers, + user_data=request_input.user_data, + ) + for request_input in simple_url_inputs + ] + + @staticmethod + async def _fetch_requests_from_url( + remote_url_requests_inputs: list[_RequestsFromUrlInput], http_client: BaseHttpClient + ) -> list[Request]: + """Crete list of requests from url. + + Send GET requests to urls defined in each requests_from_url of remote_url_requests_inputs. Run extracting + callback on each response body and use URL_NO_COMMAS_REGEX regex to find all links. Create list of Requests from + collected links and additional inputs stored in other attributes of each remote_url_requests_inputs. 
+ """ + created_requests: list[Request] = [] + + def create_requests_from_response(request_input: _RequestsFromUrlInput, task: Task) -> None: + """Callback to scrape response body with regexp and create Requests from matches.""" + matches = re.finditer(URL_NO_COMMAS_REGEX, task.result().read().decode('utf-8')) + created_requests.extend( + [ + Request.from_url( + match.group(0), + method=request_input.method, + payload=request_input.payload.encode('utf-8'), + headers=request_input.headers, + user_data=request_input.user_data, + ) + for match in matches + ] + ) + + remote_url_requests = [] + for remote_url_requests_input in remote_url_requests_inputs: + get_response_task = asyncio.create_task( + http_client.send_request( + method='GET', + url=remote_url_requests_input.requests_from_url, + ) + ) + + get_response_task.add_done_callback(partial(create_requests_from_response, remote_url_requests_input)) + remote_url_requests.append(get_response_task) + + await asyncio.gather(*remote_url_requests) + return created_requests diff --git a/tests/unit/actor/test_request_list.py b/tests/unit/actor/test_request_list.py new file mode 100644 index 00000000..4f4c75ac --- /dev/null +++ b/tests/unit/actor/test_request_list.py @@ -0,0 +1,212 @@ +from __future__ import annotations + +import re +from dataclasses import dataclass +from typing import Any, get_args + +import pytest +import respx +from httpx import Response + +from crawlee._request import UserData +from crawlee._types import HttpMethod + +from apify.storages._request_list import URL_NO_COMMAS_REGEX, RequestList + + +@pytest.mark.parametrize('request_method', get_args(HttpMethod)) +@pytest.mark.parametrize( + 'optional_input', + [ + {}, + { + 'payload': 'some payload', + 'userData': {'some key': 'some value'}, + 'headers': {'h1': 'v1', 'h2': 'v2'}, + }, + ], + ids=['minimal', 'all_options'], +) +async def test_request_list_open_request_types(request_method: HttpMethod, optional_input: dict[str, Any]) -> None: + """Test proper request list generation from both minimal and full inputs for all method types for simple input.""" + minimal_request_dict_input = { + 'url': 'https://www.abc.com', + 'method': request_method, + } + request_dict_input = {**minimal_request_dict_input, **optional_input} + + request_list = await RequestList.open(request_list_sources_input=[request_dict_input]) + assert not await request_list.is_empty() + request = await request_list.fetch_next_request() + assert request is not None + assert await request_list.is_empty() + + assert request.method == request_dict_input['method'] + assert request.url == request_dict_input['url'] + assert request.payload == request_dict_input.get('payload', '').encode('utf-8') + expected_user_data = UserData() + if 'userData' in optional_input: + for key, value in optional_input['userData'].items(): + expected_user_data[key] = value + assert request.user_data == expected_user_data + assert request.headers.root == optional_input.get('headers', {}) + + +@respx.mock +async def test_request_list_open_from_url_correctly_send_requests() -> None: + """Test that requests are sent to expected urls.""" + request_list_sources_input: list[dict[str, Any]] = [ + { + 'requestsFromUrl': 'https://abc.dev/file.txt', + 'method': 'GET', + }, + { + 'requestsFromUrl': 'https://www.abc.dev/file2', + 'method': 'PUT', + }, + { + 'requestsFromUrl': 'https://www.something.som', + 'method': 'POST', + 'headers': {'key': 'value'}, + 'payload': 'some_payload', + 'userData': {'another_key': 'another_value'}, + }, + ] + + routes = 
+    routes = [respx.get(entry['requestsFromUrl']) for entry in request_list_sources_input]
+
+    await RequestList.open(request_list_sources_input=request_list_sources_input)
+
+    for route in routes:
+        assert route.called
+
+
+@respx.mock
+async def test_request_list_open_from_url() -> None:
+    """Test that RequestList.open correctly reads URLs from remote URL sources as well as from simple input."""
+    expected_simple_url = 'https://www.someurl.com'
+    expected_remote_urls_1 = {'http://www.something.com', 'https://www.somethingelse.com', 'http://www.bla.net'}
+    expected_remote_urls_2 = {'http://www.ok.com', 'https://www.true-positive.com'}
+    expected_urls = expected_remote_urls_1 | expected_remote_urls_2 | {expected_simple_url}
+
+    @dataclass
+    class MockedUrlInfo:
+        url: str
+        response_text: str
+
+    mocked_urls = (
+        MockedUrlInfo(
+            'https://abc.dev/file.txt',
+            'blablabla{} more blablabla{} , even more blablabla. {} '.format(*expected_remote_urls_1),
+        ),
+        MockedUrlInfo(
+            'https://www.abc.dev/file2',
+            'some stuff{} more stuff{} www.false_positive.com'.format(*expected_remote_urls_2),
+        ),
+    )
+
+    request_list_sources_input = [
+        {
+            'requestsFromUrl': mocked_urls[0].url,
+            'method': 'GET',
+        },
+        {'url': expected_simple_url, 'method': 'GET'},
+        {
+            'requestsFromUrl': mocked_urls[1].url,
+            'method': 'GET',
+        },
+    ]
+    for mocked_url in mocked_urls:
+        respx.get(mocked_url.url).mock(return_value=Response(200, text=mocked_url.response_text))
+
+    request_list = await RequestList.open(request_list_sources_input=request_list_sources_input)
+    generated_requests = []
+    while request := await request_list.fetch_next_request():
+        generated_requests.append(request)
+
+    # Check that the requests in the request list were created with the expected URLs
+    assert {generated_request.url for generated_request in generated_requests} == expected_urls
+
+
+@respx.mock
+async def test_request_list_open_from_url_additional_inputs() -> None:
+    """Test that all generated request properties are correctly populated from input values."""
+    expected_url = 'https://www.someurl.com'
+    example_start_url_input: dict[str, Any] = {
+        'requestsFromUrl': 'https://crawlee.dev/file.txt',
+        'method': 'POST',
+        'headers': {'key': 'value'},
+        'payload': 'some_payload',
+        'userData': {'another_key': 'another_value'},
+    }
+
+    respx.get(example_start_url_input['requestsFromUrl']).mock(return_value=Response(200, text=expected_url))
+
+    request_list = await RequestList.open(request_list_sources_input=[example_start_url_input])
+    request = await request_list.fetch_next_request()
+
+    # Check that all request properties were populated correctly
+    assert request
+    assert request.url == expected_url
+    assert request.method == example_start_url_input['method']
+    assert request.headers.root == example_start_url_input['headers']
+    assert request.payload == str(example_start_url_input['payload']).encode('utf-8')
+    expected_user_data = UserData()
+    for key, value in example_start_url_input['userData'].items():
+        expected_user_data[key] = value
+    assert request.user_data == expected_user_data
+
+
+async def test_request_list_open_name() -> None:
+    name = 'some_name'
+    request_list = await RequestList.open(name=name)
+    assert request_list.name == name
+
+
+@pytest.mark.parametrize(
+    'true_positive',
+    [
+        'http://www.something.com',
+        'https://www.something.net',
+        'http://nowww.cz',
+        'https://with-hypen.com',
+        'http://number1.com',
+        'http://www.number.123.abc',
+        'http://many.dots.com',
+        'http://a.com',
+        'http://www.something.com/somethignelse', 'http://www.something.com/somethignelse.txt',
+        'http://non-english-chars-áíéåü.com',
+        'http://www.port.com:1234',
+        'http://username:password@something.else.com',
+    ],
+)
+def test_url_no_commas_regex_true_positives(true_positive: str) -> None:
+    example_string = f'Some text {true_positive} some more text'
+    matches = list(re.finditer(URL_NO_COMMAS_REGEX, example_string))
+    assert len(matches) == 1
+    assert matches[0].group(0) == true_positive
+
+
+@pytest.mark.parametrize(
+    'false_positive',
+    [
+        'http://www.a',
+        'http://a',
+        'http://a.a',
+        'http://123.456',
+        'www.something.com',
+        'http:www.something.com',
+    ],
+)
+def test_url_no_commas_regex_false_positives(false_positive: str) -> None:
+    example_string = f'Some text {false_positive} some more text'
+    matches = list(re.findall(URL_NO_COMMAS_REGEX, example_string))
+    assert len(matches) == 0
+
+
+def test_url_no_commas_regex_multi_line() -> None:
+    true_positives = ('http://www.something.com', 'http://www.else.com')
+    example_string = 'Some text {} some more text \n Some new line text {} ...'.format(*true_positives)
+    matches = list(re.finditer(URL_NO_COMMAS_REGEX, example_string))
+    assert len(matches) == 2
+    assert {match.group(0) for match in matches} == set(true_positives)
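
Below is a minimal usage sketch (not part of the diff) of how the new `RequestList` could be wired into an Actor; the `requestListSources` input field name and the `Actor.get_input()` plumbing are assumptions based on the docstrings in `_request_list.py`:

```python
# Sketch only: assumes the Actor input carries a 'requestListSources' array shaped like the
# examples in the RequestList.open docstring above.
from apify import Actor
from apify.storages import RequestList


async def main() -> None:
    async with Actor:
        actor_input = await Actor.get_input() or {}
        # Plain 'url' entries are used directly; 'requestsFromUrl' sources are fetched
        # and expanded into individual requests.
        request_list = await RequestList.open(
            request_list_sources_input=actor_input.get('requestListSources', []),
        )
        while request := await request_list.fetch_next_request():
            Actor.log.info(f'Loaded request: {request.url}')
```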