feat: Handle request list user input #326


Merged (21 commits) on Nov 20, 2024
3 changes: 3 additions & 0 deletions pyproject.toml
@@ -141,6 +141,9 @@ indent-style = "space"
docstring-quotes = "double"
inline-quotes = "single"

[tool.ruff.lint.flake8-type-checking]
runtime-evaluated-base-classes = ["pydantic.BaseModel", "crawlee.configuration.Configuration"]
Comment from the PR author:
Even though crawlee.configuration.Configuration inherits from pydantic_settings.BaseSettings, which in turn inherits from pydantic.BaseModel, ruff has trouble following the inheritance hierarchy that far. So until that changes, some models will have to be mentioned explicitly even though they ultimately inherit from pydantic.BaseModel.
See the closed issue where this is described as a known limitation, at least to a certain degree:
astral-sh/ruff#8725
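
For illustration, a minimal, hypothetical sketch (not code from this PR; the subclass and field names are made up) of why the explicit entry matters once `from __future__ import annotations` is combined with ruff's flake8-type-checking rules:

```python
# Hypothetical example, not part of this repository.
from __future__ import annotations

from datetime import timedelta  # ruff would normally move this under TYPE_CHECKING...

from crawlee.configuration import Configuration  # ...unless it knows this base is runtime-evaluated.


class MyConfiguration(Configuration):
    # Pydantic evaluates the `timedelta` annotation at runtime to build the field,
    # so the import above must stay a regular (runtime) import. Ruff only exempts it
    # when `crawlee.configuration.Configuration` is listed in
    # `runtime-evaluated-base-classes`; it does not follow the chain
    # Configuration -> BaseSettings -> BaseModel on its own.
    my_timeout: timedelta = timedelta(seconds=30)
```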


[tool.ruff.lint.flake8-builtins]
builtins-ignorelist = ["id"]

3 changes: 2 additions & 1 deletion src/apify/_actor.py
@@ -8,7 +8,6 @@

 from lazy_object_proxy import Proxy
 from pydantic import AliasChoices
-from typing_extensions import Self

 from apify_client import ApifyClientAsync
 from apify_shared.consts import ActorEnvVars, ActorExitCodes, ApifyEnvVars
@@ -31,6 +30,8 @@
     import logging
     from types import TracebackType

+    from typing_extensions import Self
+
     from crawlee.proxy_configuration import _NewUrlFunction

     from apify._models import Webhook
1 change: 0 additions & 1 deletion src/apify/_configuration.py
@@ -1,4 +1,3 @@
-# ruff: noqa: TCH001 TCH002 TCH003 (so that pydantic annotations work)
 from __future__ import annotations

 from datetime import datetime, timedelta
1 change: 0 additions & 1 deletion src/apify/_models.py
@@ -1,4 +1,3 @@
-# ruff: noqa: TCH001 TCH002 TCH003 (Pydantic)
 from __future__ import annotations

 from datetime import datetime, timedelta
2 changes: 1 addition & 1 deletion src/apify/_platform_event_manager.py
@@ -1,7 +1,7 @@
 from __future__ import annotations

 import asyncio
-from datetime import datetime  # noqa: TCH003
+from datetime import datetime
 from typing import TYPE_CHECKING, Annotated, Any, Literal, Union

 import websockets.client
6 changes: 4 additions & 2 deletions src/apify/scrapy/middlewares/apify_proxy.py
@@ -1,11 +1,13 @@
 from __future__ import annotations

+from typing import TYPE_CHECKING
 from urllib.parse import ParseResult, urlparse

 try:
-    from scrapy import Request, Spider  # noqa: TCH002
+    if TYPE_CHECKING:
+        from scrapy import Request, Spider
+        from scrapy.crawler import Crawler
     from scrapy.core.downloader.handlers.http11 import TunnelError
-    from scrapy.crawler import Crawler  # noqa: TCH002
     from scrapy.exceptions import NotConfigured
 except ImportError as exc:
     raise ImportError(
5 changes: 4 additions & 1 deletion src/apify/scrapy/pipelines/actor_dataset_push.py
@@ -1,9 +1,12 @@
 from __future__ import annotations

+from typing import TYPE_CHECKING
+
 from itemadapter.adapter import ItemAdapter

 try:
-    from scrapy import Item, Spider  # noqa: TCH002
+    if TYPE_CHECKING:
+        from scrapy import Item, Spider
 except ImportError as exc:
     raise ImportError(
         'To use this module, you need to install the "scrapy" extra. Run "pip install apify[scrapy]".',
5 changes: 4 additions & 1 deletion src/apify/scrapy/scheduler.py
@@ -1,15 +1,18 @@
 from __future__ import annotations

 import traceback
+from typing import TYPE_CHECKING

 from apify._configuration import Configuration
 from apify.apify_storage_client import ApifyStorageClient

 try:
     from scrapy import Spider
     from scrapy.core.scheduler import BaseScheduler
-    from scrapy.http.request import Request  # noqa: TCH002
     from scrapy.utils.reactor import is_asyncio_reactor_installed
+
+    if TYPE_CHECKING:
+        from scrapy.http.request import Request
 except ImportError as exc:
     raise ImportError(
         'To use this module, you need to install the "scrapy" extra. Run "pip install apify[scrapy]".',
5 changes: 4 additions & 1 deletion src/apify/scrapy/utils.py
@@ -2,14 +2,17 @@

 import asyncio
 from base64 import b64encode
+from typing import TYPE_CHECKING
 from urllib.parse import unquote

 from apify_shared.utils import ignore_docs

 try:
-    from scrapy.settings import Settings  # noqa: TCH002
     from scrapy.utils.project import get_project_settings
     from scrapy.utils.python import to_bytes
+
+    if TYPE_CHECKING:
+        from scrapy.settings import Settings
 except ImportError as exc:
     raise ImportError(
         'To use this module, you need to install the "scrapy" extra. For example, if you use pip, run '
4 changes: 3 additions & 1 deletion src/apify/storages/__init__.py
@@ -1,3 +1,5 @@
 from crawlee.storages import Dataset, KeyValueStore, RequestQueue

-__all__ = ['Dataset', 'KeyValueStore', 'RequestQueue']
+from ._request_list import RequestList
+
+__all__ = ['Dataset', 'KeyValueStore', 'RequestQueue', 'RequestList']
150 changes: 150 additions & 0 deletions src/apify/storages/_request_list.py
@@ -0,0 +1,150 @@
from __future__ import annotations

import asyncio
import re
from asyncio import Task
from functools import partial
from typing import Annotated, Any, Union

from pydantic import BaseModel, Field, TypeAdapter

from crawlee import Request
from crawlee._types import HttpMethod
from crawlee.http_clients import BaseHttpClient, HttpxHttpClient
from crawlee.storages import RequestList as CrawleeRequestList

from apify._utils import docs_group

URL_NO_COMMAS_REGEX = re.compile(
    r'https?:\/\/(www\.)?([^\W_]|[^\W_][-\w0-9@:%._+~#=]{0,254}[^\W_])\.[a-z]{2,63}(:\d{1,5})?(\/[-\w@:%+.~#?&/=()]*)?'
)


class _RequestDetails(BaseModel):
    method: HttpMethod = 'GET'
    payload: str = ''
    headers: Annotated[dict[str, str], Field(default_factory=dict)] = {}
    user_data: Annotated[dict[str, str], Field(default_factory=dict, alias='userData')] = {}


class _RequestsFromUrlInput(_RequestDetails):
    requests_from_url: str = Field(alias='requestsFromUrl')


class _SimpleUrlInput(_RequestDetails):
    url: str


url_input_adapter = TypeAdapter(list[Union[_RequestsFromUrlInput, _SimpleUrlInput]])


@docs_group('Classes')
class RequestList(CrawleeRequestList):
    """Extends the crawlee RequestList.

    The open method is used to create a RequestList from the Actor's requestListSources input.
    """

    @staticmethod
    async def open(
        name: str | None = None,
        request_list_sources_input: list[dict[str, Any]] | None = None,
        http_client: BaseHttpClient | None = None,
    ) -> RequestList:
        """Creates RequestList from Actor input requestListSources.

        Args:
            name: Name of the returned RequestList.
            request_list_sources_input: List of dicts, each with either a url key or a requestsFromUrl key.
            http_client: Client that will be used to send GET requests to the urls defined by the requestsFromUrl keys.

        Returns:
            RequestList created from request_list_sources_input.

        ### Usage

        ```python
        example_input = [
            # Gather urls from response body.
            {'requestsFromUrl': 'https://crawlee.dev/file.txt', 'method': 'GET'},
            # Directly include this url.
            {'url': 'https://crawlee.dev', 'method': 'GET'}
        ]
        request_list = await RequestList.open(request_list_sources_input=example_input)
        ```
        """
        request_list_sources_input = request_list_sources_input or []
        return await RequestList._create_request_list(name, request_list_sources_input, http_client)

    @staticmethod
    async def _create_request_list(
        name: str | None, request_list_sources_input: list[dict[str, Any]], http_client: BaseHttpClient | None
    ) -> RequestList:
        if not http_client:
            http_client = HttpxHttpClient()

        url_inputs = url_input_adapter.validate_python(request_list_sources_input)

        simple_url_inputs = [url_input for url_input in url_inputs if isinstance(url_input, _SimpleUrlInput)]
        remote_url_inputs = [url_input for url_input in url_inputs if isinstance(url_input, _RequestsFromUrlInput)]

        simple_url_requests = RequestList._create_requests_from_input(simple_url_inputs)
        remote_url_requests = await RequestList._fetch_requests_from_url(remote_url_inputs, http_client=http_client)

        return RequestList(name=name, requests=simple_url_requests + remote_url_requests)

    @staticmethod
    def _create_requests_from_input(simple_url_inputs: list[_SimpleUrlInput]) -> list[Request]:
        return [
            Request.from_url(
                method=request_input.method,
                url=request_input.url,
                payload=request_input.payload.encode('utf-8'),
                headers=request_input.headers,
                user_data=request_input.user_data,
            )
            for request_input in simple_url_inputs
        ]

    @staticmethod
    async def _fetch_requests_from_url(
        remote_url_requests_inputs: list[_RequestsFromUrlInput], http_client: BaseHttpClient
    ) -> list[Request]:
        """Create a list of requests from remote urls.

        Sends a GET request to each url defined in requests_from_url of remote_url_requests_inputs, runs an
        extraction callback on each response body, and uses the URL_NO_COMMAS_REGEX regex to find all links. Creates
        a list of Requests from the collected links and the additional inputs stored in the other attributes of each
        remote_url_requests_inputs entry.
        """
        created_requests: list[Request] = []

        def create_requests_from_response(request_input: _RequestsFromUrlInput, task: Task) -> None:
            """Callback to scrape the response body with the regexp and create Requests from the matches."""
            matches = re.finditer(URL_NO_COMMAS_REGEX, task.result().read().decode('utf-8'))
            created_requests.extend(
                [
                    Request.from_url(
                        match.group(0),
                        method=request_input.method,
                        payload=request_input.payload.encode('utf-8'),
                        headers=request_input.headers,
                        user_data=request_input.user_data,
                    )
                    for match in matches
                ]
            )

        remote_url_requests = []
        for remote_url_requests_input in remote_url_requests_inputs:
            get_response_task = asyncio.create_task(
                http_client.send_request(
                    method='GET',
                    url=remote_url_requests_input.requests_from_url,
                )
            )

            get_response_task.add_done_callback(partial(create_requests_from_response, remote_url_requests_input))
            remote_url_requests.append(get_response_task)

        await asyncio.gather(*remote_url_requests)
        return created_requests